@inproceedings{4697e155bd7f407cac3c66052ac12e88,
title = "Video captioning based on joint image-audio deep learning techniques",
abstract = "With the advancement in technology, deep learning has been widely used for various multimedia applications. Herein, we utilized this technology to video captioning. The proposed system uses different neural networks to extract features from image, audio, and semantic signals. Image and audio features are concatenated before being fed into a long short-term memory (LSTM) for initialization. The joint audio-image features help the entire semantics to form a network with better performance.A bilingual evaluation understudy algorithm (BLEU) - an automatic speech scoring mechanism - was used to score sentences. We considered the length of the word group (one word to four words); with the increase of all BLEU scores by more than 1%, the CIDEr-D score increased by 2.27%, and the METEOR and ROUGE-L scores increased by 0.2% and 0.7%, respectively. The improvement is highly significant.",
keywords = "Acoustic scene classification, Convolutional neural networks, Long short-term memory, Sound event detection, Video captioning, Word embedding",
author = "Wang, {Chien Yao} and Liaw, {Pei Sin} and Liang, {Kai Wen} and Wang, {Jai Ching} and Chang, {Pao Chi}",
note = "Publisher Copyright: {\textcopyright} 2019 IEEE.; 9th IEEE International Conference on Consumer Electronics, ICCE-Berlin 2019 ; Conference date: 08-09-2019 Through 11-09-2019",
year = "2019",
month = sep,
doi = "10.1109/ICCE-Berlin47944.2019.8966173",
language = "???core.languages.en_GB???",
series = "IEEE International Conference on Consumer Electronics - Berlin, ICCE-Berlin",
publisher = "IEEE Computer Society",
pages = "127--131",
editor = "Gordan Velikic and Christian Gross",
booktitle = "Proceedings - 2019 IEEE 9th International Conference on Consumer Electronics, ICCE-Berlin 2019",
}
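
% The abstract describes concatenating image and audio features and using the joint
% feature to initialize an LSTM caption decoder. Below is a minimal PyTorch sketch of
% that fusion scheme; it is not the authors' code, and all names and dimensions
% (JointAVCaptioner, img_dim, aud_dim, hidden_dim, vocab_size) are illustrative
% assumptions. BibTeX ignores text outside entries, so this note does not affect parsing.
%
% import torch
% import torch.nn as nn
%
% class JointAVCaptioner(nn.Module):
%     def __init__(self, img_dim=2048, aud_dim=128, hidden_dim=512, vocab_size=10000):
%         super().__init__()
%         # Project the concatenated image-audio feature to the LSTM state size.
%         self.fuse = nn.Linear(img_dim + aud_dim, hidden_dim)
%         self.embed = nn.Embedding(vocab_size, hidden_dim)
%         self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
%         self.out = nn.Linear(hidden_dim, vocab_size)
%
%     def forward(self, img_feat, aud_feat, captions):
%         # Joint audio-image feature initializes the LSTM hidden state.
%         fused = torch.tanh(self.fuse(torch.cat([img_feat, aud_feat], dim=-1)))
%         h0 = fused.unsqueeze(0)            # (1, batch, hidden)
%         c0 = torch.zeros_like(h0)
%         x = self.embed(captions)           # (batch, seq, hidden)
%         y, _ = self.lstm(x, (h0, c0))
%         return self.out(y)                 # per-step vocabulary logits
%
% model = JointAVCaptioner()
% logits = model(torch.randn(4, 2048), torch.randn(4, 128),
%                torch.randint(0, 10000, (4, 12)))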