publications

Publications with the keyword: vision and language


2021
[14]
Seeing past words: Testing the cross-modal capabilities of pretrained V&L models on counting tasks (Parcalabescu, L; Gatt, A; Frank, A and Calixto, I), In Proceedings of the Workshop Beyond Language: Multimodal Semantic Representations (MMSR'21), 2021.
@inproceedings{counting2021,
	Title = {Seeing past words: Testing the cross-modal capabilities of pretrained V\&L models on counting tasks},
	Author = {Parcalabescu, L and Gatt, A and Frank, A and Calixto, I},
	Year = {2021},
	Address = {Groningen, The Netherlands},
	Booktitle = {Proceedings of the Workshop Beyond Language: Multimodal Semantic Representations (MMSR'21)},
	eprinttype = {arxiv},
	eprint={https://arxiv.org/abs/2012.12352},
	Keywords = {vision and language}		
}
[arxiv]
2020
[13]
Gradations of error severity in automatic image description (van Miltenburg, E; Lu, W-T; Krahmer, E; Gatt, A; Chen, G; Li, L and van Deemter, K), In Proceedings of the 13th International Conference on Natural Language Generation (INLG'20), Association for Computational Linguistics, 2020.
@inproceedings{inlg2020errorseverity,
	Title = {Gradations of error severity in automatic image description},
	Author = {van Miltenburg, E and Lu, W-T and Krahmer, E and Gatt, A and Chen, G and Li, L and van Deemter, K},
	Booktitle = {Proceedings of the 13th International Conference on Natural Language Generation (INLG'20)},
	Year = {2020},
	Address = {Dublin, Ireland},
	Pages = {398-411},
	Publisher = {Association for Computational Linguistics},	
	Url = {https://www.aclweb.org/anthology/2020.inlg-1.45.pdf},
	Keywords = {natural language generation, vision and language, evaluation}		
}
2019
[12]
Transfer learning from language models to image caption generators: Better models may not transfer better (Tanti, M; Gatt, A and Camilleri, KP), arXiv preprint, volume 1901.01216, 2019.
@article{transfer2019,
	Title = {Transfer learning from language models to image caption generators: Better models may not transfer better},
	Author = {Tanti, M and Gatt, A and Camilleri, KP},
	Journal = {arXiv preprint},
	Year = {2019},
	Volume={1901.01216},
	eprinttype={arxiv},
	eprint={https://arxiv.org/pdf/1901.01216.pdf},
	Keywords = {vision and language, natural language generation, image captioning, transfer learning}}
[arxiv]
[11]
Quantifying the amount of visual information used by neural caption generators (Tanti, M; Gatt, A and Camilleri, K), In Computer Vision – ECCV 2018 Workshops: Proceedings of the Workshop on Shortcomings in Vision and Language (Leal-Taixé, L; Roth, S, eds.), Springer, 2019.
@inproceedings{sivl2019sensitivity,
	Author = {Tanti, M and Gatt, A and Camilleri, K},
	Title = {Quantifying the amount of visual information used by neural caption generators},
	Url = {https://link.springer.com/chapter/10.1007/978-3-030-11018-5_11},
	Booktitle = {Computer Vision – ECCV 2018 Workshops: Proceedings of the Workshop on Shortcomings in Vision and Language},
	Year = {2019},
	Editor = {L Leal-Taixé and S Roth},
	Address = {Munich, Germany},
	Publisher = {Springer},
	eprinttype={arxiv},
	eprint={https://arxiv.org/abs/1810.05475},
	Pages = {124-132},
	Doi = {10.1007/978-3-030-11018-5_11},
	Keywords = {vision and language, image captioning}}
[doi] [arxiv]
[10]
Pre-gen metrics: Predicting caption quality metrics without generating captions (Tanti, M; Gatt, A and Muscat, A), In Computer Vision – ECCV 2018 Workshops: Proceedings of the Workshop on Shortcomings in Vision and Language (Leal-Taixé, L; Roth, S, eds.), Springer, 2019.
@inproceedings{sivl2019pre-gen,
	Author = {Tanti, M and Gatt, A and Muscat, A},
	Title = {Pre-gen metrics: Predicting caption quality metrics without generating captions},
	Booktitle = {Computer Vision – ECCV 2018 Workshops: Proceedings of the Workshop on Shortcomings in Vision and Language},
	Year = {2019},
	Url = {https://link.springer.com/chapter/10.1007/978-3-030-11018-5_10},
	Address = {Munich, Germany},
	Publisher = {Springer},
	Pages = {114-123},
	Editor = {L Leal-Taixé and S Roth},
	Doi = {10.1007/978-3-030-11018-5_10},
	eprinttype={arxiv},
	eprint={https://arxiv.org/abs/1810.05474},
	Keywords = {vision and language, image captioning}}
[doi] [arxiv]
[9]
Visually Grounded Generation of Entailments from Premises (Jafaritazehjani, S; Gatt, A and Tanti, M), In Proceedings of the 12th International Conference on Natural Language Generation (INLG'19), Association for Computational Linguistics, 2019.
@inproceedings{inlg2019entailment,
	Author = {Jafaritazehjani, S and Gatt, A and Tanti, M},
	Title = {Visually Grounded Generation of Entailments from Premises},
	Booktitle = {Proceedings of the 12th International Conference on Natural Language Generation (INLG'19)},
	Year = {2019},
	Address = {Tokyo, Japan},
	Publisher = {Association for Computational Linguistics},
	eprinttype = {arxiv},
	eprint={https://arxiv.org/abs/1909.09788},
	Url = {https://www.aclweb.org/anthology/W19-8625/},
	Keywords = {vision and language, textual entailment, natural language generation}
}
[arxiv]
2018
[8]
Where to put the image in an image caption generator (Tanti, M; Gatt, A and Camilleri, K), Natural Language Engineering, volume 24, 2018.
@article{tanti2017image,
	Author = {Tanti, M and Gatt, A and Camilleri, K},
	Journal = {Natural Language Engineering},
	Pages = {467-489},
	Title = {Where to put the image in an image caption generator},
	Year = {2018},
	Volume = {24},
	Number = {3},
	eprinttype  = {arxiv},
  	eprint      = {https://arxiv.org/abs/1703.09137},
  	Url = {https://www.cambridge.org/core/journals/natural-language-engineering/article/where-to-put-the-image-in-an-image-caption-generator/A5B0ACFFE8E4AEAA5840DC61F93153F3},
  	Doi = {10.1017/S1351324918000098},
  	Keywords = {vision and language, natural language generation, image captioning}}
[doi] [arxiv]
[7]
Face2Text: Collecting an Annotated Image Description Corpus for the Generation of Rich Face Descriptions (Gatt, A; Tanti, M; Muscat, A; Paggio, P; Farrugia, R; Borg, C; Camilleri, K; Rosner, M and van der Plas, L), In Proceedings of the 11th edition of the Language Resources and Evaluation Conference (LREC'18), 2018.
@inproceedings{lrec2018,
	author = {Gatt, A and Tanti, M and Muscat, A and Paggio, P and Farrugia, R and Borg, C and Camilleri, K and Rosner, M and van der Plas, L},
	year = {2018},
	title = {Face2Text: Collecting an Annotated Image Description Corpus for the Generation of Rich Face Descriptions},
	booktitle = {Proceedings of the 11th edition of the Language Resources and Evaluation Conference (LREC'18)},
	Url = {http://www.lrec-conf.org/proceedings/lrec2018/pdf/226.pdf},
	eprinttype={arxiv},
	eprint={https://arxiv.org/abs/1803.03827},
	keywords={vision and language,image captioning,face description}}
[arxiv]
[6]
Predicting visual spatial relations in the Maltese language (Muscat, A and Gatt, A), In Breaking Barriers: Junior College Multidisciplinary Conference, University of Malta Junior College, 2018.
@inproceedings{jc2019-prepositions,
	Author = {Muscat, A and Gatt, A},
	Year = {2018},
	Title = {Predicting visual spatial relations in the {M}altese language},
	Booktitle = {Breaking Barriers: Junior College Multidisciplinary Conference},
	Pages = {414-450},
	Address = {Malta},
	Publisher = {University of Malta Junior College},
	Url = {https://staff.um.edu.mt/albert.gatt/pubs/jc2018-prepositions.pdf},
	Keywords = {vision and language, maltese, image captioning, spatial relations}
}
[5]
Grounded textual entailment (Vu, HT; Greco, C; Erofeeva, A; Jafaritazehjani, S; Linders, G; Tanti, M; Testoni, A; Bernardi, R and Gatt, A), In Proceedings of the 27th International Conference on Computational Linguistics (COLING'18), Association for Computational Linguistics, 2018.
@inproceedings{coling2018,
	author = {Vu, HT and Greco, C and Erofeeva, A and Jafaritazehjani, S and Linders, G and Tanti, M and Testoni, A and Bernardi, R and Gatt, A},
	year = {2018},
	title = {Grounded textual entailment},
	Url = {http://aclweb.org/anthology/C18-1199},
	booktitle = {Proceedings of the 27th International Conference on Computational Linguistics (COLING'18)},
	address={Santa Fe, New Mexico},
	publisher = {Association for Computational Linguistics},
	pages={2354-2368},
	eprinttype={arxiv},
	eprint={https://arxiv.org/abs/1806.05645},
	keywords={vision and language,textual entailment}}
[arxiv]
2017
[4]
What is the Role of Recurrent Neural Networks (RNNs) in an Image Caption Generator? (Tanti, M; Gatt, A and Camilleri, K), In Proceedings of the 10th International Conference on Natural Language Generation (INLG'17), Association for Computational Linguistics, 2017.
@inproceedings{inlg2017,
	Address = {Santiago de Compostela, Spain},
	Author = {Tanti, M and Gatt, A and Camilleri, K},
	Booktitle = {Proceedings of the 10th International Conference on Natural Language Generation (INLG'17)},
	Publisher = {Association for Computational Linguistics},
	Url = {http://aclweb.org/anthology/W/W17/W17-3506.pdf},	
	Title = {What is the Role of Recurrent Neural Networks (RNNs) in an Image Caption Generator?},
	Year = {2017},
	Eprinttype = {arxiv},
	Eprint  = {https://arxiv.org/abs/1708.02043},
	Keywords = {vision and language, natural language generation, image captioning}}
[arxiv]
[3]
Reference Production as Search: The Impact of Domain Size on the Production of Distinguishing Descriptions (Gatt, A; Krahmer, E; van Deemter, K and van Gompel, RPG), Cognitive Science, volume 41, 2017.
@article{gatt2016reference,
	Author = {Gatt, A and Krahmer, E and van Deemter, K and van Gompel, RPG},
	Journal = {Cognitive Science},
	Title = {Reference Production as Search: The Impact of Domain Size on the Production of Distinguishing Descriptions},
	Url = {http://onlinelibrary.wiley.com/doi/10.1111/cogs.12375/abstract},
	eprinttype = {preprint},
	eprint = {./pubs/cognitive-science_distractor-set-size.pdf},
	Volume = {41},
	Number={S6},
	Doi = {10.1111/cogs.12375},
	Pages = {1457--1492},
	Year = {2017},
	Keywords = {natural language generation, language production, psycholinguistics, cognitive modelling, referring expressions, overspecification, vision and language}}
[doi] [preprint]
2016
[2]
Viewing time affects overspecification: Evidence for two strategies of attribute selection during reference production (Koolen, Ruud; Gatt, Albert; van Gompel, Roger PG; Krahmer, Emiel and van Deemter, Kees), In Proceedings of the 38th Annual Meeting of the Cognitive Science Society (CogSci'16), Cognitive Science Society, 2016.
@inproceedings{koolen2016viewing,
	Address = {Austin, TX},
	Author = {Koolen, Ruud and Gatt, Albert and van Gompel, Roger PG and Krahmer, Emiel and van Deemter, Kees},
	Booktitle = {Proceedings of the 38th Annual Meeting of the Cognitive Science Society (CogSci'16)},
	Publisher = {Cognitive Science Society},
	Title = {Viewing time affects overspecification: Evidence for two strategies of attribute selection during reference production},
	Url = {https://staff.um.edu.mt/albert.gatt/pubs/cogsci2016-reference-under-load.pdf},
	Year = {2016},
	Keywords = {language production, psycholinguistics, cognitive modelling, referring expressions, overspecification, vision and language}}
2013
[1]
Production of referring expressions: Preference trumps discrimination (Gatt, A; Krahmer, E; van Gompel, RPG and van Deemter, K), In Proceedings of the 35th Annual Meeting of the Cognitive Science Society (CogSci'13), Cognitive Science Society, 2013.
@inproceedings{gatt2013production,
	Address = {Austin, TX},
	Author = {Gatt, A and Krahmer, E and van Gompel, RPG and van Deemter, K},
	Booktitle = {Proceedings of the 35th Annual Meeting of the Cognitive Science Society (CogSci'13)},
	Publisher = {Cognitive Science Society},
	Title = {Production of referring expressions: Preference trumps discrimination},
	Url = {https://staff.um.edu.mt/albert.gatt/pubs/cogsci2013-disc-power.pdf},
	Year = {2013},
	Keywords = {psycholinguistics, language production, referring expressions, overspecification, vision and language}}