|
@comment{
  Conference paper giving an overview of the OASIS segment-based speech
  recognizer. Review notes: the record previously had an empty citation key;
  two author names lacked the "Last, First" comma; the editor field held the
  place name "Eger-Noszvaj" (Eger and Noszvaj are towns in Hungary), which is
  now part of the address field. No editor names are known from this record;
  add an editor field if they can be confirmed from the proceedings.
}
@inproceedings{kocsor1999oasis,
  author    = {Kocsor, Andr{\'a}s and Kuba, Jr., Andr{\'a}s and T{\'o}th, L{\'a}szl{\'o}},
  title     = {An Overview of the {OASIS} speech recognition project},
  booktitle = {Proceedings of the 4th International Conference on Applied Informatics},
  year      = {1999},
  month     = aug,
  address   = {Eger-Noszvaj, Hungary},
  pages     = {94--102},
  abstract  = {This paper presents an overview of the ``OASIS'' segment-based
    speech recognition project developed at the Research Group on Artificial
    Intelligence of the Hungarian Academy of Sciences. The aim of this project
    is to build a speech recognizer for Hungarian natural numbers. For this, a
    traditional spectral representation is computed first, from which
    acoustic-phonetic features are extracted. Some of these, like the energies
    of certain frequency bands, were chosen in accordance with our knowledge
    about human auditory processing, while others, like ``sonority'' and
    ``voicedness'' measure the supposed acoustic correlates of these phonetic
    features. Based on the change of the features the speech signal is
    segmented into phonetically stable parts, and so-called interval-features
    are calculated over these segments. To each interval-feature belongs a
    function called ``cue'', which gets a segment and a phoneme as input and
    returns a punishment which indicates how probably the segment can be the
    given phoneme --- according to that interval-feature. The cues are trained
    on the distribution of the feature on the database. The punishments of the
    several cues are aggregated by the phoneme evaluator, which thus can
    compute how well a phoneme fits to an interval (segment or series of
    segments). Finally, the matching engine matches all the possible
    segmentations to all the possible phoneme strings given by the dictionary,
    and returns the one with the smallest punishment as the result of
    recognition. In the past year we have examined many possible feature sets.
    We have also examined several methods for the phoneme evaluation, e.g. we
    used the C4.5 system and also tried an instance-based learning technique.
    The matching engine has also developed a lot: here pruning of the search
    space is crucial for an acceptable speed, but it is very difficult to find
    a proper aggregation and normalization of the punishments which allows the
    comparison of segment series of quite different lengths. Currently our
    system uses a set of 19 features, and gives the best results with the C4.5
    evaluator and with the matching engine that traverses the search space
    with a backtrack algorithm. We give a detailed description of all these
    modules in the paper, and also present our recognition results on the
    phonemic and word level, trained on a database of 26-26 words from 20
    talkers.}
} |
|