Bibtex
Content Classification of Multimedia Documents using Partitions of Low-Level Features
@comment{
  lk06: journal article by Leopold and Kindermann, Journal of Virtual
  Reality and Broadcasting, volume 3, number 6, 2006.
  The note field carries the URN of the article; the ISSN of the journal
  is kept in its own issn field. Percent signs in the abstract are escaped
  so the text stays valid LaTeX if a style typesets it.
}
@article{lk06,
  author   = {Leopold, Edda and Kindermann, J{\"o}rg},
  title    = {Content Classification of Multimedia Documents using Partitions of Low-Level Features},
  journal  = {Journal of Virtual Reality and Broadcasting},
  editor   = {Herder, Jens},
  year     = {2006},
  volume   = {3},
  number   = {6},
  month    = jan,
  issn     = {1860-2037},
  note     = {urn:nbn:de:0009-6-7607},
  abstract = {Audio-visual documents obtained from German TV news are
              classified according to the IPTC topic categorization scheme.
              To this end usual text classification techniques are adapted
              to speech, video, and non-speech audio. For each of the three
              modalities word analogues are generated: sequences of
              syllables for speech, ``video words'' based on low level color
              features (color moments, color correlogram and color wavelet),
              and ``audio words'' based on low-level spectral features
              (spectral envelope and spectral flatness) for non-speech
              audio. Such audio and video words provide a means to represent
              the different modalities in a uniform way. The frequencies of
              the word analogues represent audio-visual documents: the
              standard bag-of-words approach. Support vector machines are
              used for supervised classification in a 1 vs. n setting.
              Classification based on speech outperforms all other single
              modalities. Combining speech with non-speech audio improves
              classification. Classification is further improved by
              supplementing speech and non-speech audio with video words.
              Optimal F-scores range between 62\% and 94\% corresponding to
              50\%--84\% above chance. The optimal combination of modalities
              depends on the category to be recognized. The construction of
              audio and video words from low-level features provide a good
              basis for the integration of speech, non-speech audio and
              video.},
}
