% Conference paper in the PCM 2018 proceedings (Lecture Notes in Computer Science series).
% The citation key is an opaque identifier; it is kept unchanged so existing citations still resolve.
@inproceedings{0397e052fd404587a44f5647ea6e75ec,
  author    = {Situ, Runwei and Yang, Zhenguo and Lv, Jianming and Li, Qing and Liu, Wenyin},
  title     = {Cross-Modal Event Retrieval: A Dataset and a Baseline Using Deep Semantic Learning},
  booktitle = {Advances in Multimedia Information Processing -- {PCM} 2018},
  editor    = {Hong, Richang and Cheng, Wen-Huang and Yamasaki, Toshihiko and Wang, Meng and Ngo, Chong-Wah},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  pages     = {147--157},
  publisher = {Springer Verlag},
  address   = {Germany},
  year      = {2018},
  month     = sep,
  doi       = {10.1007/978-3-030-00767-6_14},
  isbn      = {978-3-030-00766-9},
  language  = {English},
  keywords  = {Common space, Cross-modal event retrieval, Deep learning},
  abstract  = {In this paper, we propose to learn Deep Semantic Space (DSS) for cross-modal event retrieval, which is achieved by exploiting deep learning models to extract semantic features from images and textual articles jointly. More specifically, a VGG network is used to transfer deep semantic knowledge from a large-scale image dataset to the target image dataset. Simultaneously, a fully-connected network is designed to model semantic representation from textual features (e.g., TF-IDF, LDA). Furthermore, the obtained deep semantic representations for image and text can be mapped into a high-level semantic space, in which the distance between data samples can be measured straightforwardly for cross-model event retrieval. In particular, we collect a dataset called Wiki-Flickr event dataset for cross-modal event retrieval, where the data are weakly aligned unlike image-text pairs in the existing cross-modal retrieval datasets. Extensive experiments conducted on both the Pascal Sentence dataset and our Wiki-Flickr event dataset show that our DSS outperforms the state-of-the-art approaches.},
  note      = {19th Pacific-Rim Conference on Multimedia (PCM 2018); Conference date: 21-09-2018 Through 22-09-2018},
}