Publications


see also: Google Scholar, Scopus page, or Loop Profile

@inproceedings{Grosser10:WMHH,
  author    = {Grosser, Tobias and Heine, Viktoria and Gl{\"u}ge, Stefan and Siegert, Ingo and Frommer, J{\"o}rg and Wendemuth, Andreas},
  title     = {Artificial Intelligent Systems and Cognition},
  booktitle = {Proceedings of 1st International Conference on What makes Humans Human},
  year      = {2010},
  pages     = {5 pages},
  address   = {Ulm, Germany},
  abstract  = {In the last decades tremendous improvements in acoustic modelling of speech for automatic speech recognition were made. Nonetheless, the interaction between humans and computer systems via an automatic speech recognition interface still regularly leads to unsatisfied users. In this paper recordings of 3 simulated group meetings with limited vocabulary are analysed from two perspectives to find starting points to overcome this problem. One perspective is given by a computer engineer and the other by a sociologist. Our intention is to provide insights into the dynamics of group meetings which might lead to more robust and adaptive Automatic Speech Recognition systems and to enhanced sociological understanding of group situations from a different point of view.},
  keywords  = {kongressproc,kongress},
}

@inproceedings{SCHERER10:LREC,
  author    = {Scherer, Stefan and Siegert, Ingo and Bigalke, Lutz and Meudt, Sascha},
  title     = {Developing an Expressive Speech Labeling Tool Incorporating the Temporal Characteristics of Emotion},
  booktitle = {Proc. of the Seventh International Conference on Language Resources and Evaluation (LREC'10)},
  year      = {2010},
  address   = {Valletta, Malta},
  editor    = {Nicoletta Calzolari and Khalid Choukri and Bente Maegaard and Joseph Mariani and Jan Odijk and Stelios Piperidis and Mike Rosner and Daniel Tapias},
  publisher = {ELRA},
  isbn      = {2-9517408-6-7},
  abstract  = {A lot of research effort has been spent on the development of emotion theories and modeling, however, their suitability and applicability to expressions in human computer interaction has not exhaustively been evaluated. Furthermore, investigations concerning the ability of the annotators to map certain expressions onto the developed emotion models is lacking proof. The proposed annotation tool, which incorporates the standard Geneva Emotional Wheel developed by Klaus Scherer and a novel temporal characteristic description feature, is aiming towards enabling the annotator to label expressions recorded in human computer interaction scenarios on an utterance level. Further, it is respecting key features of realistic and natural emotional expressions, such as their sequentiality, temporal characteristics, their mixed occurrences, and their expressivity or clarity of perception. Additionally, first steps towards evaluating the proposed tool, by analyzing utterance annotations taken from two expressive speech corpora, are undertaken and some future goals including the open source accessibility of the tool are given.},
  url       = {http://www.lrec-conf.org/proceedings/lrec2010/pdf/101_Paper.pdf},
  keywords  = {conferenceproc,conference},
}

@incollection{Boeck2011ACII,
  author    = {B{\"o}ck, Ronald and Siegert, Ingo and Haase, Matthias and Lange, Julia and Wendemuth, Andreas},
  title     = {ikannotate - A Tool for Labelling, Transcription, and Annotation of Emotionally Coloured Speech},
  booktitle = {Affective Computing and Intelligent Interaction},
  publisher = {Springer Berlin, Heidelberg},
  year      = {2011},
  editor    = {D'Mello, Sidney and Graesser, Arthur and Schuller, Bj{\"o}rn and Martin, Jean-Claude},
  volume    = {6974},
  series    = {Lecture Notes in Computer Science},
  pages     = {25--34},
  doi       = {10.1007/978-3-642-24600-5},
  abstract  = {In speech recognition and emotion recognition from speech, qualitatively high transcription and annotation of given material is important. To analyse prosodic features, linguistics provides several transcription systems. Furthermore, in emotion labelling different methods are proposed and discussed. In this paper, we introduce the tool ikannotate, which combines prosodic information with emotion labelling. It allows the generation of a transcription of material directly annotated with prosodic features. Moreover, material can be emotionally labelled according to Basic Emotions, the Geneva Emotion Wheel, and Self Assessment Manikins. Finally, we present results of two usability tests observing the ability to identify emotions in labelling and comparing the transcription tool “Folker” with our application.},
  keywords  = {conferencebook,conference},
}

@inproceedings{SiegertICME2011,
  author    = {Siegert, Ingo and B{\"o}ck, Ronald and Philippou-H{\"u}bner, David and Vlasenko, Bogdan and Wendemuth, Andreas},
  title     = {{Appropriate Emotional Labeling of Non-acted Speech Using Basic Emotions, Geneva Emotion Wheel and Self Assessment Manikins}},
  booktitle = {Proceedings of the 2011 IEEE International Conference on Multimedia \& Expo},
  year      = {2011},
  address   = {Barcelona, Spain},
  month     = jul,
  publisher = {IEEE},
  doi       = {10.1109/ICME.2011.6011929},
  abstract  = {The detection of user emotions play an important role in human-machine-interaction. By incorporating emotion recognition, applications such as monitoring agents or digital companions can be improved. The recognition result is used to model the state of the user. This leads to an adaptive reaction of the system towards users needs and claims and thus to an enhanced system reaction. Besides emotions, based on short events, personality and moods also play an important role. Standard emotion recogniser do not consider them adequately and therefore neglect a crucial part of user modelling. The challenge is to gather a reliable prediction about the actual mood of the user and moreover represent changes in users' mood during interaction. This includes both the different user-dispositions and the observation of the inner user-state. Therefore conclusions from external conditions has to be drawn. For this we suggest a model with few parameters that, based on a prevailing mood represents the actual mood of an user. Thereby recognised emotions are modelled as forces, appealing on the mood and shifting it within the VAD-space. Additionally by implementing a repulsing force a decaying mood can be modelled. Thereby it is possible to simulate the short-time effects and the volatility of emotions and also the slow and continuous development of moods. Furthermore the model is able to incorporate different traits of personality. The prevailing mood, representing a certain user-disposition, will be shifted within the VAD-space. In Addition, recognised emotions are assigned to different forces, by what external observations according to personality-traits can represent different inner user-states.},
  url       = {pdf/2011_icme_preprint.pdf},
  keywords  = {conferenceproc,conference},
}

@inproceedings{VlasenkoICME2011,
  author    = {Vlasenko, Bogdan and Philippou-H{\"u}bner, David and Prylipko, Dmytro and B{\"o}ck, Ronald and Siegert, Ingo and Wendemuth, Andreas},
  title     = {{Vowels Formants Analysis Allows Straightforward Detection of High Arousal Emotions}},
  booktitle = {Proceedings of the 2011 IEEE International Conference on Multimedia \& Expo},
  year      = {2011},
  address   = {Barcelona, Spain},
  month     = jul,
  publisher = {IEEE},
  doi       = {10.1109/ICME.2011.6012003},
  abstract  = {Recently, automatic emotion recognition from speech has achieved growing interest within the human-machine interaction research community. Most part of emotion recognition methods use context independent frame-level analysis or turn-level analysis. In this article, we introduce context dependent vowel level analysis applied for emotion classification. An average first formant value extracted on vowel level has been used as unidimensional acoustic feature vector. The Neyman-Pearson criterion has been used for classification purpose. Our classifier is able to detect high-arousal emotions with small error rates. Within our research we proved that the smallest emotional unit should be the vowel instead of the word. We find out that using vowel level analysis can be an important issue during developing a robust emotion classifier. Also, our research can be useful for developing robust affective speech recognition methods and high quality emotional speech synthesis systems.},
  keywords  = {conferenceproc,conference},
}

@inproceedings{Boeck2011ICME,
  author    = {B{\"o}ck, Ronald and Siegert, Ingo and Vlasenko, Bogdan and Wendemuth, Andreas and Haase, Matthias and Lange, Julia},
  title     = {A Processing Tool for Emotionally Coloured Speech},
  booktitle = {Proceedings of the 2011 IEEE International Conference on Multimedia \& Expo},
  year      = {2011},
  address   = {Barcelona, Spain},
  month     = jul,
  publisher = {IEEE},
  abstract  = {In speech recognition and emotion recognition from speech, qualitatively high transcription and annotation of given material is important. To analyse prosodic features, linguistics provides several transcription systems. Furthermore, in emotion labelling different methods are proposed and discussed. In this paper, we introduce the tool ikannotate, which combines prosodic information with emotion labelling. It allows the generation of a transcription of material directly annotated with prosodic features. Moreover, material can be emotionally labelled according to Basic Emotions, the Geneva Emotion Wheel, and Self Assessment Manikins. Finally, we present results of two usability tests observing the ability to identify emotions in labelling and comparing the transcription tool “Folker” with our application.},
  keywords  = {conferenceproc,conference},
}

@inproceedings{SiegertCOST2011,
  author    = {Siegert, Ingo and B{\"o}ck, Ronald and Wendemuth, Andreas},
  title     = {Incorporation of a mood-model to improve user-disposition prediction from emotion recognition},
  booktitle = {Program and Abstracts of the COST 2102 Final Conference held in conjunction with the 4th COST 2102 International Training School on COGNITIVE BEHAVIOURAL SYSTEMS},
  year      = {2011},
  address   = {Dresden, Germany},
  month     = feb,
  publisher = {Dresden : Inst. f{\"u}r Akustik und Sprachkommunikation},
  isbn      = {978-3-86780-219-2},
  keywords  = {conferenceproc,conference},
}

@incollection{Siegert2012273,
  author    = {Siegert, Ingo and Böck, Ronald and Wendemuth, Andreas},
  title     = {Modeling users' mood state to improve human-machine-interaction},
  booktitle = {Cognitive Behavioural Systems},
  publisher = {Springer Berlin, Heidelberg},
  editor    = {A. Esposito and A. M. Esposito and A. Vinciarelli and R. Hoffmann and V. C. Müller},
  series    = {Lecture Notes in Computer Science},
  volume    = {7403},
  year      = {2012},
  pages     = {273--279},
  doi       = {10.1007/978-3-642-34584-5_23},
  abstract  = {The detection of user emotions plays an important role in Human-Machine-Interaction. By considering emotions, applications such as monitoring agents or digital companions are able to adapt their reaction towards users’ needs and claims. Besides emotions, personality and moods are eminent as well. Standard emotion recognizers do not consider them adequately and therefore neglect a crucial part of user modeling.

The challenge is to gather reliable predictions about the actual mood of the user and, beyond that, represent changes in users’ mood during interaction. In this paper we present a model that incorporates both the tracking of mood changes based on recognized emotions and different personality traits. Furthermore we present a first evaluation on realistic data.},
  url       = {pdf/2012_cost_preprint.pdf},
  keywords  = {buch},
}

@inproceedings{Siegert:MA3,
  author    = {Siegert, Ingo and B{\"o}ck, Ronald and Wendemuth, Andreas},
  title     = {The Influence of Context Knowledge for Multimodal Annotation on natural Material},
  booktitle = {Joint Proceedings of the IVA 2012 Workshops},
  year      = {2012},
  month     = {09},
  address   = {Santa Cruz, USA},
  editor    = {Ronald B{\"o}ck and Francesca Bonin and Nick Campbell and Jens Edlund and {de Kok}, Iwan and Ronald Poppe and David Traum},
  publisher = {Otto von Guericke University Magdeburg},
  isbn      = {978-3-940961-83-9},
  pages     = {25--32},
  abstract  = {In emotion recognition from speech, a good transcription and annotation of given material is crucial. Moreover, the question of how to find good emotional labels for new data material is a basic issue. An important question is how the context influences the decision of the annotator. In this paper, we present our investigations for emotional labelling on natural multimodal data with and without the computer's responses as one main context-information within an natural human computer interaction. We show that for emotional labels the computer's responses did not influence the decision of the annotators.},
  url       = {pdf/2012_ma3_preprint.pdf},
  keywords  = {conferenceproc,conference},
}

@inproceedings{ESSV_IS:2012,
  author    = {Siegert, Ingo and B{\"o}ck, Ronald and Philippou-H{\"u}bner, David and Wendemuth, Andreas},
  title     = {Investigation of Hierarchical Classification for Simultaneous Gender and Age Recognitions},
  booktitle = {Elektronische Sprachsignalverarbeitung 2012. Tagungsband der 23. Konferenz},
  year      = {2012},
  pages     = {58--65},
  address   = {Cottbus, Germany},
  series    = {Studientexte zur Sprachkommunikation},
  editor    = {Matthias Wolff},
  publisher = {TUDpress},
  volume    = {64},
  abstract  = {For a successful speech-controlled human-machine-interaction individualized models are needed. If the system is designed to run with many users for short times each, a complete user adaptation is not useful. A possible solution would be to use user-group pre-adapted models and recognize the group the actual speaker belongs to in the very first beginning of the interaction. In this paper we present and investigate different methods to recognize age and gender groups with a hierarchical model to improve the recognition rate. We could prove, that our method could get adequate results on a four class problem compared with classical approaches.},
  keywords  = {kongressbook,kongress},
}

@inproceedings{ESSV_RB:2012,
  author    = {B{\"o}ck, Ronald and Limbrecht, Kerstin and Siegert, Ingo and Gl{\"u}ge, Stefan and Walter, Steffen and Wendemuth, Andreas},
  title     = {Combining Mimic and Prosodic Analyses for User Disposition Classification},
  booktitle = {Elektronische Sprachsignalverarbeitung 2012. Tagungsband der 23. Konferenz},
  year      = {2012},
  pages     = {220--227},
  address   = {Cottbus, Germany},
  series    = {Studientexte zur Sprachkommunikation},
  editor    = {Matthias Wolff},
  publisher = {TUDpress},
  volume    = {64},
  abstract  = {Automatic classification of the users’ internal affective and emotional states is to be considered for many applications, ranging from organisational tasks to health care. To develop automatic technical systems suitable training material is necessary and an appropriate adaptation towards users is needed. In this work, we present preliminary but promising results of our research focusing on emotion classification by visual and audio signals. This is related to a semi-automatic and crossmodality labelling of data sets which will help to establish a kind of ground truth for labels in the adaptation process of classifiers. In our experiments we showed that prosodic features, especially, higher order ones like formant’s three bandwidth are related to visual/mimic expressions.},
  keywords  = {kongressbook,kongress},
}

@inproceedings{ISPCC:2012,
  author    = {Panning, Axel and Siegert, Ingo and Al-Hamadi, Ayoub and Wendemuth, Andreas and R{\"o}sner, Dietmar and Frommer, J{\"o}rg and Krell, Gerald and Michaelis, Bernd},
  title     = {Multimodal Affect Recognition in Spontaneous HCI Environment},
  booktitle = {Proceedings of 2012 IEEE International Conference on Signal Processing, Communications and Computing (ICSPCC)},
  year      = {2012},
  month     = aug,
  address   = {Hong Kong, China},
  pages     = {430--435},
  doi       = {10.1109/ICSPCC.2012.6335662},
  abstract  = {Human Computer Interaction (HCI) is known to be a multimodal process. In this paper we will show results of experiments for affect recognition, with non-acted, affective multimodal data from the new Last Minute Corpus (LMC). This corpus is more related to real HCI applications than other known data sets where affective behavior is elicited untypically for HCI.We utilize features from three modalities: facial expressions, prosody and gesture. The results show, that even simple fusion architectures can reach respectable results compared to other approaches. Further we could show, that probably not all features and modalities contribute substantially to the classification process, where prosody and eye blink frequency seem most contributing in the analyzed dataset.},
  keywords  = {conferenceproc,conference},
}

@inproceedings{FROMMER12.782,
  author    = {Frommer, J{\"o}rg and Michaelis, Bernd and R{\"o}sner, Dietmar and Wendemuth, Andreas and Friesen, Rafael and Haase, Matthias and Kunze, Manuela and Andrich, Rico and Lange, Julia and Panning, Axel and Siegert, Ingo},
  title     = {{Towards Emotion and Affect Detection in the Multimodal LAST MINUTE Corpus}},
  booktitle = {Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12)},
  year      = {2012},
  month     = may,
  pages     = {3064--3069},
  address   = {Istanbul, Turkey},
  editor    = {Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Mehmet Uğur Doğan and Bente Maegaard and Joseph Mariani and Jan Odijk and Stelios Piperidis},
  publisher = {ELRA},
  isbn      = {978-2-9517408-7-7},
  abstract  = {The LAST MINUTE corpus comprises multimodal recordings (e.g. video, audio, transcripts) from WOZ interactions in a mundane planning task (Rösner et al., 2011). It is one of the largest corpora with naturalistic data currently available. In this paper we report about first results from attempts to automatically and manually analyze the different modes with respect to emotions and affects exhibited by the subjects. We describe and discuss difficulties encountered due to the strong contrast between the naturalistic recordings and traditional databases with acted emotions.},
  url       = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/782_Paper.pdf},
  keywords  = {conferenceproc,conference},
}

@inproceedings{Kotzyba2012,
  author    = {Kotzyba, Michael and Deml, Barbara and Neumann, Hendrik and Glüge, Stefan and Hartmann, Kim and Siegert, Ingo and Wendemuth, Andreas and Traue, Harald and Walter, Steffen},
  title     = {Emotion Detection by Event Evaluation using Fuzzy Sets as Appraisal Variables},
  booktitle = {Proceedings of the 11th International Conference on Cognitive Modeling (ICCM 2012)},
  year      = {2012},
  month     = {04},
  pages     = {123--124},
  address   = {Berlin, Germany},
  editor    = {N. Rußwinkel and U. Drewitz and H. van Rijn},
  publisher = {Universitaetsverlag der TU Berlin},
  abstract  = {A very important and challenging task in cognitive science is the detection and modeling of human emotions. On the one hand, computers could benefit from that, because emotions play a significant role in rational decision making, perception, learning, and a variety of other important cognitive functions. On the other hand there is a need for genuinely intelligent computers that adapt and interact with human users in a natural way. To achieve this goal, computers need the ability to recognize and to express emotions.},
  keywords  = {conferenceproc,conference},
}

@inproceedings{HARTMANN12,
  author    = {Hartmann, Kim and Siegert, Ingo and Glüge, Stefan and Wendemuth, Andreas and Kotzyba, Michael and Deml, Barbara},
  title     = {Describing Human Emotions Through Mathematical Modelling},
  booktitle = {Proceedings of the MATHMOD 2012 - 7th Vienna International Conference on Mathematical Modelling},
  year      = {2012},
  month     = {02},
  address   = {Vienna, Austria},
  doi       = {10.3182/20120215-3-AT-3016.00081},
  abstract  = {To design a companion technology we focus on the appraisal theory model to predict emotions and determine the appropriate system behaviour to support Human-Computer-Interaction. Until now, the implementation of emotion processing was hindered by the fact that the theories needed originate from diverging research areas, hence divergent research techniques and result representations are present. Since this difficulty arises repeatedly in interdisciplinary research, we investigated the use of mathematical modelling as an unifying language to translate the coherence of appraisal theory. We found that the mathematical category theory supports the modelling of human emotions according to the appraisal theory model and hence assists the implementation.},
  keywords  = {conferenceproc,conference},
}

@inproceedings{6617458,
  author    = {Siegert, Ingo and Glodek, Michael and Panning, Axel and Krell, Gerald and Schwenker, Friedhelm and Al-Hamadi, Ayoub and Wendemuth, Andreas},
  title     = {Using speaker group dependent modelling to improve fusion of fragmentary classifier decisions},
  booktitle = {IEEE International Conference on Cybernetics (CYBCONF)},
  year      = {2013},
  pages     = {132--137},
  month     = jun,
  address   = {Lausanne, Switzerland},
  doi       = {10.1109/CYBConf.2013.6617458},
  abstract  = {Current speech-controlled human computer interaction is purely based on spoken information. For a successful interaction, additional information such as the individual skills, preferences and actual affective state of the user are often mandatory. The most challenging of these additional inputs is the affective state, since affective cues are in general expressed very sparsely. The problem can be addressed in two ways. On the one hand, the recognition can be enhanced by making use of already available individual information. On the other hand, the recognition is aggravated by the fact that research is often limited to a single modality, which in real-life applications is critical since recognition may fail in case sensors do not perceive a signal. We address the problem by enhancing the acoustic recognition of the affective state by partitioning the user into groups. The assignment of a user to a group is performed at the beginning of the interaction, such that subsequently a specialized classifier model is utilized. Furthermore, we make use of several modalities, acoustics, facial expressions, and gesture information. The combination of decisions not affected by sensor failures from these multiple modalities is achieved by a Markov Fusion Network. The proposed approach is studied empirically using the LAST MINUTE corpus. We could show that compared to previous studies a significant improvement of the recognition rate can be obtained.},
  url       = {pdf/2013_cybconf_preprint.pdf},
  keywords  = {conferenceproc,conference},
}

@incollection{HBU2013,
  author    = {Siegert, Ingo and Hartmann, Kim and Philippou-Hübner, David and Wendemuth, Andreas},
  title     = {Human Behaviour in HCI: Complex Emotion Detection through Sparse Speech Features},
  booktitle = {Human Behavior Understanding},
  year      = {2013},
  isbn      = {978-3-319-02713-5},
  volume    = {8212},
  series    = {Lecture Notes in Computer Science},
  editor    = {Salah, AlbertAli and Hung, Hayley and Aran, Oya and Gunes, Hatice},
  doi       = {10.1007/978-3-319-02714-2_21},
  publisher = {Springer International Publishing},
  pages     = {246--257},
  abstract  = {To obtain a more human-like interaction with technical systems, those have to be adaptable to the users’ individual skills, preferences, and current emotional state. In human-human interaction (HHI) the behaviour of the speaker is characterised by semantic and prosodic cues, given as short feedback signals. These signals minimally communicate certain dialogue functions such as attention, understanding, confirmation, or other attitudinal reactions. Thus, these signals play an important role in the progress and coordination of interaction. They allow the partners to inform each other of their behavioural or affective state without interrupting the ongoing dialogue.

Vocal communication provides acoustic details revealing the speaker’s feelings, believes, and social relations. Incorporating discourse particles (DPs) in human-computer interaction (HCI) systems will allow the detection of complex emotions, which are currently hard to access. Complex emotions in turn are closely related to human behaviour. Hence, integrating automatic DP detection and complex emotion assignment in HCI systems provides a first approach to the integration of human behaviour understanding in HCI systems.

In this paper we present methods allowing to extract the pitch-contour of DPs and to assign complex emotions to observed DPs. We investigate the occurrences of DPs in naturalistic HCI and show that DPs may be assigned to complex emotions automatically. Furthermore, we show that DPs are indeed related to behaviour, showing an age-gender specific usage during naturalistic HCI. Additionally, we prove that DPs may be used to automatically detect and classify complex emotions during HCI.},
  url       = {pdf/2013_hbu_preprint.pdf},
  keywords  = {conferencebook,conference},
}

@book{T2CTCCGL13,
  author    = {Böck, Ronald and Degens, Nick and Heylen, Dirk and Louchart, Sandy and Minker, Wolfgang and Morency, Louis-Philippe and Nazir, Asad and Schwenker, Friedhelm and Siegert, Ingo},
  title     = {Joint Proceedings of the 2013 T2CT and CCGL Workshops},
  publisher = {Otto von Guericke University Magdeburg},
  year      = {2013},
  address   = {Magdeburg},
  isbn      = {978-3-940961-99-9},
  keywords  = {heraus},
}

@inproceedings{Boeck:ACII13,
  author    = {B{\"o}ck, Ronald and Gl{\"u}ge, Stefan and Siegert, Ingo and Wendemuth, Andreas},
  title     = {Annotation and Classification of Changes of Involvement in Group Conversation},
  booktitle = {Proceedings of the 2013 Humaine Association Conference on Affective Computing and Intelligent Interaction (ACII 2013)},
  address   = {Geneva, Switzerland},
  month     = sep,
  year      = {2013},
  pages     = {803--808},
  note      = {This publication is presented at the ACII 2013's satellite workshop CBAR 2013},
  abstract  = {The detection of involvement in a conversation is important to assess the level humans are participating in either a human-human or human-computer interaction. Especially, detecting changes in a group’s involvement in a multi-party interaction is of interest to distinguish several constellations in the group itself. This information can further be used in situations where technical support of meetings is favoured, for instance, focusing a camera, switching microphones, etc. Moreover, this information could also help to improve the performance of technical systems applied in human-machine interaction. In this paper, we concentrate on video material given by the TableTalk corpus. Therefore, we introduce a way of annotating and classifying changes of involvement and discuss the reliability of the annotation. Further, we present classification results based on video features using Multi-Layer Networks.},
  keywords  = {conferenceproc,conference},
}

@incollection{IFACK_H_13,
  author    = {Hartmann, Kim and Siegert, Ingo and Philippou-Hübner, David and Wendemuth, Andreas},
  title     = {Emotion Detection in HCI: From Speech Features to Emotion Space},
  booktitle = {Analysis, Design, and Evaluation of Human-Machine Systems},
  volume    = {12/1},
  year      = {2013},
  publisher = {International Federation of Automatic Control},
  issn      = {1474-6670},
  doi       = {10.3182/20130811-5-US-2037.00049},
  pages     = {288--295},
  editor    = {Narayanan, S.},
  abstract  = {Control mechanisms in modern Human-Computer Interaction (HCI) underwent a paradigm shift from textual or display-based control to more intuitive control mechanisms, such as speech, gesture and mimic. Especially speech provides a high information density, delivering information about the speaker's inner state as well as his intention and demand. While word-based analyses allow to understand the speaker's request, further speech characteristics reveal the speakers emotion, intention and motivation. Therefore, emotion detection from speech became significant in modern HCI applications. However, the results from the disciplines involved in the emotion detection are not easily merged. Engineers developing voice controlled HCI systems work in ``feature spaces'', relying on technically measurable acoustic and spectral features. Psychologists analysing and identifying emotions work in emotion categories, schemes or dimensional emotion spaces, describing emotions in terms of quantities and qualities of human notable expressions. While engineering methods notice the slightest variations in speech, emotion theories allow to compare and identify emotions, but must rely on human judgements. However, both perceptions are essential and must be combined to allow machines to allocate affective states during HCI. To provide a link between machine measurable variations in emotional speech and dimensional emotion theory, significant features describing emotions must be identified and analysed regarding their transferability to emotion space. In this article we present a justifiable feature selection for emotion detection from speech and show how to relate measurable features to emotions. We discuss our transformation model and validate both feature selection and model, based on a selection of the Emo-DB corpus.},
  keywords  = {conferenceproc,conference},
}

@incollection{siegertHCII:2013,
  author    = {Siegert, Ingo and Böck, Ronald and Wendemuth, Andreas},
  title     = {The Influence of Context Knowledge for Multi-modal Affective Annotation},
  booktitle = {Human-Computer Interaction. Towards Intelligent and Implicit Interaction},
  year      = {2013},
  isbn      = {978-3-642-39341-9},
  volume    = {8008},
  series    = {Lecture Notes in Computer Science},
  editor    = {Kurosu, Masaaki},
  doi       = {10.1007/978-3-642-39342-6_42},
  publisher = {Springer Berlin, Heidelberg},
  pages     = {381--390},
  abstract  = {To provide successful human-computer interaction, automatic emotion recognition from speech experienced greater attention, also increasing the demand for valid data material. Additionally, the difficulty to find appropriate labels is increasing. Therefore, labels, which are manageable by evaluators and cover nearly all occurring emotions, have to be found. An important question is how context influences the annotators’ decisions. In this paper, we present our investigations of emotional affective labelling on natural multi-modal data investigating different contextual aspects. We will explore different types of contextual information and their influence on the annotation process. In this paper we investigate two specific contextual factors, observable channels and knowledge about the interaction course. We discover, that the knowledge about the previous interaction course is needed to assess the affective state, but that the presence of acoustic and video channel can partially replace the lack of discourse knowledge.},
  url       = {pdf/2013_hcii_sie_preprint.pdf},
  keywords  = {conferencebook,conference},
}

@incollection{boeckHCII:2013,
  author    = {Böck, Ronald and Limbrecht-Ecklundt, Kerstin and Siegert, Ingo and Walter, Steffen and Wendemuth, Andreas},
  title     = {Audio-Based Pre-classification for Semi-automatic Facial Expression Coding},
  booktitle = {Human-Computer Interaction. Towards Intelligent and Implicit Interaction},
  year      = {2013},
  isbn      = {978-3-642-39341-9},
  volume    = {8008},
  series    = {Lecture Notes in Computer Science},
  editor    = {Kurosu, Masaaki},
  doi       = {10.1007/978-3-642-39342-6_33},
  publisher = {Springer Berlin, Heidelberg},
  pages     = {301--309},
  abstract  = {The automatic classification of the users’ internal affective and emotional states is nowadays to be considered for many applications, ranging from organisational tasks to health care. Developing suitable automatic technical systems, training material is necessary for an appropriate adaptation towards users. In this paper, we present a framework which reduces the manual effort in annotation of emotional states. Mainly it pre-selects video material containing facial expressions for a detailed coding according to the Facial Action Coding System based on audio features, namely prosodic and mel-frequency features. Further, we present results of first experiments which were conducted to give a proof-of-concept and to define the parameters for the classifier that is based on Hidden Markov Models. The experiments were done on the EmoRec I dataset.},
  keywords  = {conferencebook,conference},
}

@inproceedings{Schmidt:SPIE,
  author    = {Schmidt, Daniel and Sadri, Hossein and Szewieczek, Artur and Sinapius, Michael and Wierach, Peter and Siegert, Ingo and Wendemuth, Andreas},
  title     = {Characterization of {Lamb} wave attenuation mechanisms},
  booktitle = {Proceedings of SPIE Smart Structures and Materials+ Nondestructive Evaluation and Health Monitoring},
  volume    = {8695},
  pages     = {869503--869510},
  year      = {2013},
  doi       = {10.1117/12.2009594},
  abstract  = {Structural Health Monitoring (SHM) based on Lamb waves, a type of ultrasonic guided waves, is a promising technique for in-service inspection of composite structures. This study investigates the attenuation mechanisms of Lamb wave propagation fields. The attenuation of an anisotropic plate is experimental measured with air-coupled ultrasonic scanning techniques and analytical modeled using higher order plate theory. Based on the experimental and analytical data the various attenuation mechanisms are characterized for the fundamental Lamb wave modes.},
  keywords  = {conferenceproc,conference}
}

@article{Siegert:DuePiblico,
  author    = {Siegert, Ingo and Hartmann, Kim and Glüge, Stefan and Wendemuth, Andreas},
  title     = {Modelling of Emotional Development within Human-Computer-Interaction},
  journal   = {Kognitive Systeme},
  year      = {2013},
  issn      = {2197-0343},
  publisher = {Universitätsbibliothek Duisburg-Essen},
  url       = {pdf/2013_KogSys_preprint.pdf},
  abstract  = {Future trends point towards the usage of technical systems as companions, adaptable to the user's individual skills, preferences and current emotional state. To enable technical systems to determine a user's emotion, current research focuses on emotion recognition. Besides emotions, personality and moods are eminent as well. Standard emotion recognizers do not consider them adequately and therefore neglect a crucial part of user modelling. The challenge is to gather reliable predictions about the observed emotion of the user and, beyond that, recognise changes in the users emotional reaction during interaction. In this paper we present a mood model that incorporates personality traits based on emotionally labeled data.},
  keywords  = {zeitung}
}

@incollection{Krell:MPRSS13,
  author    = {Krell, Gerald and Glodek, Michael and Panning, Axel and Siegert, Ingo and Michaelis, Bernd and Wendemuth, Andreas and Schwenker, Friedhelm},
  title     = {Fusion of Fragmentary Classifier Decisions for Affective State Recognition},
  booktitle = {Multimodal Pattern Recognition of Social Signals in Human-Computer-Interaction. First IAPR TC3 Workshop, MPRSS 2012, Tsukuba, Japan, November 11, 2012, Revised Selected Papers},
  series    = {Lecture Notes in Artificial Intelligence},
  volume    = {7742},
  pages     = {116--130},
  editor    = {Schwenker, Friedhelm and Scherer, Stefan and Morency, Louis-Philippe},
  publisher = {Springer Berlin, Heidelberg},
  year      = {2013},
  isbn      = {978-3-642-37080-9},
  doi       = {10.1007/978-3-642-37081-6_13},
  url       = {pdf/2013_mpr22_preprint.pdf},
  abstract  = {Real human-computer interaction systems based on different modalities face the problem that not all information channels are always available at regular time steps. Nevertheless an estimation of the current user state is required at anytime to enable the system to interact instantaneously based on the available modalities. A novel approach to decision fusion of fragmentary classifications is therefore proposed and empirically evaluated for audio and video signals of a corpus of non-acted user behavior. It is shown that visual and prosodic analysis successfully complement each other leading to an outstanding performance of the fusion architecture.},
  keywords  = {conferencebook,conference}
}

@article{SiegertCOGN:2014,
  author    = {Siegert, Ingo and Philippou-Hübner, David and Hartmann, Kim and Böck, Ronald and Wendemuth, Andreas},
  title     = {Investigation of Speaker Group-Dependent Modelling for Recognition of Affective States from Speech},
  journal   = {Cognitive Computation},
  year      = {2014},
  volume    = {6},
  number    = {4},
  pages     = {892--913},
  doi       = {10.1007/s12559-014-9296-6},
  issn      = {1866-9956},
  publisher = {Springer US},
  url       = {http://dx.doi.org/10.1007/s12559-014-9296-6},
  abstract  = {For successful human–machine-interaction (HCI) the pure textual information and the individual skills, preferences, and affective states of the user must be known. Therefore, as a starting point, the user’s actual affective state has to be recognized. In this work we investigated how additional knowledge, for example age and gender of the user, can be used to improve recognition of affective state. Two methods from automatic speech recognition are used to incorporate age and gender differences in recognition of affective state: speaker group-dependent (SGD) modelling and vocal tract length normalisation (VTLN). The investigations were performed on four corpora with acted and natural affected speech. Different features and two methods of classification (Gaussian mixture models (GMMs) and multi-layer perceptrons (MLPs)) were used. In addition, the effects of channel compensation and contextual characteristics were analysed. The results are compared with our own baseline results and with results reported in the literature. Two hypotheses were tested. First, incorporation of age information further improves speaker group-dependent modelling. Second, acoustic normalization does not achieve the same improvement as achieved by speaker group-dependent modelling, because the age and gender of a speaker affects the way emotions are expressed.},
  keywords  = {zeitung}
}

@inproceedings{Prylipko:INTERSPEECH2014,
  author    = {Prylipko, Dmytro and Egorow, Olga and Siegert, Ingo and Wendemuth, Andreas},
  title     = {Application of Image Processing Methods to Filled Pauses Detection from Spontaneous Speech},
  booktitle = {Proceedings of the INTERSPEECH 2014},
  year      = {2014},
  month     = sep,
  pages     = {1816--1820},
  address   = {Singapore},
  keywords  = {conferenceproc,conference}
}

@article{Prylipko:2013,
  author    = {Prylipko, Dmytro and Rösner, Dietmar and Siegert, Ingo and Günther, Stephan and Friesen, Rafael and Haase, Matthias and Vlasenko, Bogdan and Wendemuth, Andreas},
  title     = {Analysis of significant dialog events in realistic human–computer interaction},
  journal   = {Journal on Multimodal User Interfaces},
  year      = {2014},
  volume    = {8},
  number    = {1},
  pages     = {75--86},
  doi       = {10.1007/s12193-013-0144-x},
  issn      = {1783-7677},
  publisher = {Springer Berlin Heidelberg},
  url       = {http://dx.doi.org/10.1007/s12193-013-0144-x},
  abstract  = {This paper addresses issues of automatically detecting significant dialog events (SDEs) in naturalistic HCI, and of deducing trait-specific conclusions relevant for the design of spoken dialog systems. We perform our investigations on the multimodal LAST MINUTE corpus with records from naturalistic interactions. First, we used textual transcripts to analyse interaction styles and discourse structures. We found indications that younger subjects prefer a more technical style in communication with dialog systems. Next, we model the subject’s internal success state with a hidden Markov model trained using the observed sequences of system feedback. This reveals that younger subjects interact significantly more successful with technical systems. Aiming on automatic detection of specific subjects’s reactions, we then semi-automatically annotate SDEs—phrases indicating an irregular, i.e. not-task-oriented subject behavior. We use both acoustic and linguistic features to build several trait-specific classifiers for dialog phases, which showed pronouncedly different accuracies for diverse age and gender groups. The presented investigations coherently support age-dependence of both expressiveness and problem-solving ability. This in turn induces design rules for future automatic designated “companion” systems.},
  keywords  = {Human–computer interaction; Multimodal analysis; Companion technology,zeitung}
}

@article{Siegert:JMUI,
  author    = {Siegert, Ingo and Böck, Ronald and Wendemuth, Andreas},
  title     = {Inter-rater reliability for emotion annotation in human–computer interaction: comparison and methodological improvements},
  journal   = {Journal on Multimodal User Interfaces},
  year      = {2014},
  volume    = {8},
  number    = {1},
  pages     = {17--28},
  doi       = {10.1007/s12193-013-0129-9},
  issn      = {1783-7677},
  publisher = {Springer Berlin Heidelberg},
  url       = {http://dx.doi.org/10.1007/s12193-013-0129-9},
  abstract  = {To enable a naturalistic human–computer interaction the recognition of emotions and intentions experiences increased attention and several modalities are comprised to cover all human communication abilities. For this reason, naturalistic material is recorded, where the subjects are guided through an interaction with crucial points, but with the freedom to react individually. This material captures realistic user reactions but lacks of clear labels. So, a good transcription and annotation of the given material is essential. For that, the assignment of human annotators has become widely accepted. A good measurement for the reliability of labelled material is the inter-rater agreement. In this paper we investigate the achieved inter-rater agreement utilizing Krippendorff’s alpha for emotional annotated interaction corpora and present methods to improve the reliability, we show that the reliabilities obtained with different methods does not differ much, so a choice could rely on other aspects. Furthermore, a multimodal presentation of the items in their natural order increases the reliability.},
  keywords  = {Affective state; Annotation; Context influence; Inter-rater agreement; Labelling,zeitung}
}

@incollection{SiegertHCII:2014,
  author    = {Siegert, Ingo and Haase, Matthias and Prylipko, Dmytro and Wendemuth, Andreas},
  title     = {Discourse Particles and User Characteristics in Naturalistic Human-Computer Interaction},
  booktitle = {Human-Computer Interaction. Advanced Interaction Modalities and Techniques},
  series    = {Lecture Notes in Computer Science},
  volume    = {8511},
  pages     = {492--501},
  editor    = {Kurosu, Masaaki},
  publisher = {Springer International Publishing},
  year      = {2014},
  doi       = {10.1007/978-3-319-07230-2_47},
  isbn      = {978-3-319-07229-6},
  url       = {http://dx.doi.org/10.1007/978-3-319-07230-2_47},
  abstract  = {In human-human interaction (HHI) the behaviour of the speaker is amongst others characterised by semantic and prosodic cues. These short feedback signals minimally communicate certain dialogue functions such as attention, understanding or other attitudinal reactions. Human-computer interaction (HCI) systems have failed to note and respond to these details so far, resulting in users trying to cope with and adapt to the machines behaviour. In order to enhance HCI, an adaptation to the user’s behaviour, individual skills, and the integration of a general human behaviour understanding is indispensable. Another issue is the question if the usage of feedback signals is influenced by the user’s individuality. In this paper, we investigate the influence of specific feedback signals, known as discourse particles (DPs), with communication style and psychological characteristics within a naturalistic HCI. This investigation showed that there is a significant difference in the usage of DPs for users of certain user characteristics.},
  keywords  = {human-machine-interaction; discourse particles; personality; user characteristics,conferencebook,conference}
}

@incollection{WIRN2013,
  author    = {Siegert, Ingo and Prylipko, Dmytro and Hartmann, Kim and Böck, Ronald and Wendemuth, Andreas},
  title     = {Investigating the Form-Function-Relation of the Discourse Particle “hm” in a Naturalistic Human-Computer Interaction},
  booktitle = {Recent Advances of Neural Network Models and Applications},
  series    = {Smart Innovation, Systems and Technologies},
  volume    = {26},
  pages     = {387--394},
  editor    = {Bassis, Simone and Esposito, Anna and Morabito, Francesco Carlo},
  publisher = {Springer},
  year      = {2014},
  doi       = {10.1007/978-3-319-04129-2_39},
  isbn      = {978-3-319-04128-5},
  url       = {pdf/2014_wirn_preprint.pdf},
  abstract  = {For a successful speech-controlled human-computer interaction (HCI) the pure textual information as well as individual skills, preferences, and affective states of the user have to be known. However, verbal human interaction consists of several information layers. Apart from pure textual information, further details regarding the speaker’s feelings, believes, and social relations are transmitted. The additional information is encoded through acoustics. Especially, the intonation reveals details about the speakers communicative relation and their attitude towards the ongoing dialogue. Since the intonation is influenced by semantic and grammatical information, it is advisable to investigate the intonation of so-called discourse particles (DPs) as “hm” or “uhm”. They cannot be inflected but can be emphasised. DPs have the same intonation curves (pitch-contours) as whole sentences and thus may indicate the same functional meanings. For German language J. E. Schmidt empirically discovered seven types of form-function-concurrences on the isolated DP “hm”. To determine the function within the dialogue of the DPs, methods are needed that preserve pitch-contours and are feasible to assign defined form-prototypes. Furthermore, it must be investigated which pitch-contours occur in naturalistic HCI and whether these contours are congruent with the findings by linguists. In this paper we present first results on the extraction and correlation of the DP “hm”. We investigate the different form-function-relations in the naturalistic LAST MINUTE corpus and determine expectable form-function relations in naturalistic HCI in general.},
  keywords  = {Prosodic Analysis; Companion Systems; Human-Computer Interaction; Discourse Particle; Pitch Contour Classification,conferencebook,conference}
}

@incollection{Hartmann2015,
  author    = {Hartmann, Kim and Siegert, Ingo and Prylipko, Dmytro},
  title     = {Emotion and Disposition Detection in Medical Machines: Chances and Challenges},
  booktitle = {Machine Medical Ethics},
  series    = {Intelligent Systems, Control and Automation: Science and Engineering},
  volume    = {74},
  pages     = {317--339},
  editor    = {van Rysewyk, Simon Peter and Pontier, Matthijs},
  publisher = {Springer International Publishing},
  year      = {2015},
  doi       = {10.1007/978-3-319-08108-3_19},
  isbn      = {978-3-319-08107-6},
  abstract  = {Machines designed for medical applications beyond usual data acquisition and processing need to cooperate with and adapt to humans in order to fulfill their supportive tasks. Technically, medical machines are therefore considered as affective systems, capable of detecting, assessing and adapting to emotional states and dispositional changes in users. One of the upcoming applications of affective systems is the use as supportive machines involved in the psychiatric disorder diagnose and therapy process. These machines have the additional requirement of being capable to control persuasive dialogues in order to obtain relevant patient data despite disadvantageous set-ups. These automated abilities of technical systems combined with enhanced processing, storage and observational capabilities raise both chances and challenges in medical applications. We focus on analyzing the objectivity, reliability and validity of current techniques used to determine the emotional states of speakers from speech and the arising implications. We discuss the underlying technical and psychological models and analyze recent machine assessment results of emotional states obtained through dialogues. Conclusively we discuss the involvement of affective systems as medical machines in the psychiatric diagnostics process and therapy sessions with respect to the technical and ethical circumstances.},
  keywords  = {buch}
}

@inproceedings{SiegertESSV:2015,
  author    = {Siegert, Ingo and Philippou-Hübner, David and Tornow, Michael and Heinemann, Ralph and Wendemuth, Andreas and Ohnemus, Kerstin and Fischer, Sarah and Schreiber, Gerald},
  title     = {{Ein Datenset zur Untersuchung emotionaler Sprache in Kundenbindungsdialogen}},
  booktitle = {Elektronische Sprachsignalverarbeitung 2015. Tagungsband der 26. Konferenz},
  series    = {Studientexte zur Sprachkommunikation},
  volume    = {78},
  pages     = {180--187},
  editor    = {Wirsching, Günther},
  publisher = {TUDpress},
  year      = {2015},
  abstract  = {Für die sprachbasierte Emotionserkennung stellen Sprachdaten eine wichtige Ressource dar. Um optimale Klassifizierer zu trainieren, wäre es wünschenswert, wenn die Daten hochqualitativ sind, möglichst lange und reichhaltige Interaktionen umfassen und die Annotation möglichst viele Emotionen abdeckt und dabei reliabel ist. Weiterhin sollte der Datensatz zusätzliche Informationen über die Sprecher, wie Alter, Geschlecht, Dialekt oder Persönlichkeitsmerkmale enthalten. In diesem Beitrag wird ein neuer Datensatz natürlicher emotionaler Interaktionen vorgestellt, der die meisten Punkte adressiert. Dieser Datensatz ist in einem Forschungsprojekt entstanden, dessen Ziel es ist, ein technisches System zu entwickeln, das einen Call-Center Mitarbeiter dabei unterstützen soll, situationsangemessen auf einen telefonischen Gesprächspartner mit dessen aktuellen Emotionen zu reagieren. Zusätzlich zur emotionalen Annotation wird Crosstalk noch als weiteres Interaktionsmerkmal untersucht, um sowohl das Turntaking-Verhalten als auch die Dominanz der Sprecher zu charakterisieren.},
  keywords  = {kongressbook,kongress}
}

@inproceedings{LotzESSV:2015,
  author    = {Lotz, Alicia Flores and Siegert, Ingo and Wendemuth, Andreas},
  title     = {Automatic differentiation of form-function-relations of the discourse particle "hm" in a naturalistic human-computer interaction},
  booktitle = {Elektronische Sprachsignalverarbeitung 2015. Tagungsband der 26. Konferenz},
  series    = {Studientexte zur Sprachkommunikation},
  volume    = {78},
  pages     = {172--179},
  editor    = {Wirsching, Günther},
  publisher = {TUDpress},
  year      = {2015},
  abstract  = {The development of speech-controlled assistance systems has gained more importance in this day and time. Application ranges from driver assistance systems in the automotive sector to daily use in mobile devices such as smartphones or tablets. To ensure the reliability of these systems, not only the meaning of the pure spoken text, but also meta-information about the user or dialogue functions such as attention or turn-taking have to be perceived and processed. This further information is transmitted through intonation of words or sentences. In human communication, discourse particles serve to forward information, without interrupting the speaker. For the German language J.E. Schmidt empirically discovered seven types of form-function-concurrences on the isolated DP "hm". To also be considered in human-computer interaction, it is useful to be able to distinguish these different meanings of the DP "hm". In this paper we present an automatic classification-method to correlate a specific intonation curve to one of the seven form-function-prototypes. To verify the results of the classification algorithm we utilize a manual labeling.},
  keywords  = {kongressbook,kongress}
}

@article{Kotzyba:DuePiblico,
  author    = {Kotzyba, Michael and Siegert, Ingo and Gossen, Tatiana and Nürnberger, Andreas and Wendemuth, Andreas},
  title     = {Exploratory Voice-Controlled Search for Young Users : Challenges and Potential Benefits},
  journal   = {Kognitive Systeme},
  year      = {2015},
  volume    = {1},
  issn      = {2197-0343},
  publisher = {Universitätsbibliothek Duisburg-Essen},
  abstract  = {To date, children come more and more often into contact with information systems. Unfortunately, these systems are usually designed for adults and the increasing group of young users has been widely neglected. In order to create and support children's information seeking appropriately, we have to consider their specific state of development resulting into different needs and capabilities. In this paper, we discuss the design and the results of a user study to analyze exploratory voice-controlled search of young users. Exploratory search nicely reflects real-life information needs and voice-controlled interfaces have the advantage that children do not need to have good spelling skills. Hence the interaction can be more intuitive and motivating. We gained first insights into the children's search behavior in context of exploratory search and identified design issues for similar studies. Furthermore, we illustrate the challenges and potential benefits of interactive voice-controlled search systems for young users.},
  keywords  = {zeitung}
}

@article{BoeckKoSy2015:DuePiblico,
  author    = {Böck, Ronald and Siegert, Ingo and Wendemuth, Andreas},
  title     = {Probabilistic Breadth used in Evaluation of Resulting Gaussian Mixture Models},
  journal   = {Kognitive Systeme},
  year      = {2015},
  volume    = {2},
  issn      = {2197-0343},
  publisher = {Universitätsbibliothek Duisburg-Essen},
  abstract  = {The recognition of speech and emotions from speech is based on statistical learning methods which are usually highly tuned. Using this kind of technology, it is possible to introduce a human-machine communication and interaction since the machine is able to obtain the content and emotional information from spoken utterances. This provides the opportunity to generate machine which achieve characteristics of cognitive systems. The systems or recognisers are based on learning methods which are well-known. On the other hand, an interpretation or evaluation of such classifiers is usually challenging. For this, we present an approach which allows a more detailed interpretation of the classifier and provides an insight to the method. Our approach is based on the breadth of the resulting Gaussian model which can be generated from the mixture models given by the classifier. We introduce the method and present first results on the EmoDB corpus using a simple classifier with seven mixtures per emotion. Despite this is a show case the classification performance is 64.48\% averaged unweighted average recall. Investigating these models, we draw first conclusion on the characteristics of the Gaussian model applying the breadth as the only parameter.},
  keywords  = {zeitung}
}

@inproceedings{SiegertACII,
  author    = {Siegert, Ingo and Böck, Ronald and Vlasenko, Bogdan and Wendemuth, Andreas},
  title     = {Exploring Dataset Similarities using {PCA}-based Feature Selection},
  booktitle = {6th Affective Computing and Intelligent Interaction (ACII 2015)},
  address   = {Xi'an, China},
  month     = sep,
  year      = {2015},
  publisher = {IEEE},
  pages     = {387--393},
  abstract  = {In emotion recognition from speech, several well-established corpora are used to date for the development of classification engines. The data is annotated differently, and the community in the field uses a variety of feature extraction schemes. The aim of this paper is to investigate promising features for individual corpora and then compare the results for proposing optimal features across data sets, introducing a new ranking method. Further, this enables us to present a method for automatic identification of groups of corpora with similar characteristics. This answers an urgent question in classifier development, namely whether data from different corpora is similar enough to jointly be used as training material, overcoming shortage of material in matching domains. We compare the results of this method with manual groupings of corpora. We consider the established emotional speech corpora AVIC, ABC, DES, EMO-DB, ENTERFACE, SAL, SMARTKOM, SUSAS and VAM, however our approach is general.},
  keywords  = {conferenceproc,conference}
}

@inproceedings{SiegertISCT,
  author    = {Siegert, Ingo and Ohnemus, Kerstin},
  title     = {A new Dataset of Telephone-Based Human-Human Call-Center Interaction with Emotional Evaluation},
  booktitle = {Proc. of the 1st International Symposium on Companion Technology (ISCT 2015)},
  address   = {Ulm, Germany},
  month     = sep,
  year      = {2015},
  pages     = {143--148},
  abstract  = {Acoustic data are an important resource for speech-based emotion recognition. To obtain optimal recognisers, it would be desirable, when the data are of high quality, include preferably long and elaborate interactions, containing non-verbal events, and having a reliable and versatile emotion annotation. Additionally, the data set should contain additional information about the speakers, such as age, sex, or personality traits. This contribution presents a new dataset of telephone-based interactions recorded under real conditions, addressing most of these requests. Furthermore, first results of acoustic emotion recognition as well as analyses showing a connection between emotional changes and overlap speech segments are presented.},
  keywords  = {conferenceproc,conference}
}

@inproceedings{ERM4CTIntro,
  author    = {Hartmann, Kim and Siegert, Ingo and Schuller, Björn and Morency, Louis-Philippe and Salah, Albert Ali and Böck, Ronald},
  title     = {ERM4CT 2015: Workshop on Emotion Representations and Modelling for Companion Systems (workshop summary)},
  booktitle = {Proc. of the International Workshop on Emotion Representations and Modelling for Companion Technologies (ERM4CT'15)},
  year      = {2015},
  pages     = {1--2},
  publisher = {ACM},
  abstract  = {In this paper the organisers present a brief overview of the Workshop on Emotion Representation and Modelling for Companion Systems (ERM4CT). The ERM4CT 2015 Workshop is held in conjunction with the 17th ACM International Conference on Multimodal Interaction (ICMI 2015) taking place in Seattle, USA. The ERM4CT is the follow-up of three previous workshops on emotion modelling for affective human-computer interaction and companion systems. Apart from its usual focus on emotion representations and models, this year's ERM4HCI puts special emphasis on. },
  keywords  = {other}
}

@inproceedings{SiegertCogInfoCom,
  author    = {Siegert, Ingo and Böck, Ronald and Vlasenko, Bogdan and Ohnemus, Kerstin and Wendemuth, Andreas},
  title     = {Overlapping Speech, Utterance Duration and Affective Content in HHI and HCI - An Comparison},
  internal-note = {NOTE(review): title grammar ("An Comparison") looks wrong - verify against the published IEEE record before changing},
  booktitle = {Proc. of 6th IEEE Conference on Cognitive Infocommunications (CogInfoCom 2015)},
  address   = {Györ, Hungary},
  month     = oct,
  year      = {2015},
  pages     = {83--88},
  abstract  = {In human conversation, turn-taking is a critical issue. Especially if only the speech channel is available (e.g. telephone), correct timing as well as affective and verbal signals are required. In cases of failure, overlapping speech may occur which is in the focus of this paper. We investigate the davero corpus a large naturalistic spoken corpus of real callcenter telephone conversations and compare our findings to results on the well-known SmartKom corpus consisting of human-computer interaction. We first show that overlapping speech occurs in different types of situational settings – extending the well-known categories cooperative and competitive overlaps -, all of which are frequent enough to be analyzed. Furthermore, we present connections between the occurrence of overlapping speech and the length of the previous utterance, and show that overlapping speech occurs at dialog instances where certain affective states are changing. Our results allow the prediction of forthcoming threat of overlapping speech, and hence preventive measures, especially in professional environments like call-centers with human or automatic agents.},
  keywords  = {conferenceproc,conference}
}

@proceedings{Hartmann:2015:2829966,
  title     = {ERM4CT'15: Proceedings of the International Workshop on Emotion Representations and Modelling for Companion Technologies},
  author    = {Hartmann, Kim and Siegert, Ingo and Schuller, Björn and Morency, Louis-Philippe and Salah, Albert Ali and Böck, Ronald},
  year      = {2015},
  isbn      = {978-1-4503-3988-9},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {heraus}
}

@inproceedings{BoeckERM4CT,
  author    = {Böck, Ronald and Siegert, Ingo},
  title     = {Recognising Emotional Evolution from Speech},
  booktitle = {Proc. of the International Workshop on Emotion Representations and Modelling for Companion Technologies (ERM4CT '15)},
  address   = {Seattle, USA},
  month     = nov,
  year      = {2015},
  pages     = {13--18},
  publisher = {ACM},
  abstract  = {In an interaction, it is well known that information is being exchanged which is meaningful beyond the pure context. Especially, intentional and emotional characteristics are essential to improve not only the relationship in a human-human interaction, but also to provide benefits to increase the understanding of the transmitted information. For technical systems it is favourable to detect and interpret these additional characteristics and to transfer appropriate knowledge to human-machine interactions. In particular, this enables an automatic recognition of emotions. In this paper, we focus on speech as the transmitting modality. Beyond the detection of the location and nature of emotional instances, automatic interpretation asks "if", "how", and "when" emotions change in an interaction. In the EmoGest corpus, providing human-human interactions with emotionally induced subjects, we analyse the subject's emotional evolution. For this, we investigate the utterances spoken at the beginning and at the end of particular sessions. Automatic emotion recognition from speech shows that in intra- and inter-individual evaluations, significant differences (at least p<0.05) in the emotional state train of the subjects can be detected. Intra-individual experiments indicate a trend towards less affective states in subsequent interactions if no further inducement is given. In addition, the inter-individual analyses show that a kind of alignment can be seen in the emotional state trains of the single subjects even if their sessions were strictly separated.},
  keywords  = {conferenceproc,conference}
}

@inproceedings{TornowMMC2016,
  author    = {Tornow, Michael and Krippl, Martin and Bade, Svea and Thiers, Angelina and Siegert, Ingo and Handrich, Sebastian and Krüger, Julia and Schega, Lutz and Wendemuth, Andreas},
  title     = {Integrated Health and Fitness (iGF)-Corpus - ten-Modal Highly Synchronized Subject-Dispositional and Emotional Human Machine Interactions},
  booktitle = {Multimodal Corpora: Computer vision and language processing (MMC 2016). Workshop Programme},
  address   = {Portorož, Slovenia},
  month     = may,
  year      = {2016},
  abstract  = {A multimodal corpus on human machine interaction in the area of health and fitness is introduced in this paper. It shows the interaction of a user with a gait training system. The subjects pace through a training course four times. In the intermissions, they interact with a multimodal platform, where they are given feedback, they re-assess their performance and they plan the next steps. A high involvement of the subjects is given. By design, the interaction further evokes mental underload and overload and emotional reactions. The platform interaction was arranged as a Wizard of Oz Setup. In the interaction phase, 10 modalities are recorded in 20 sensory channels with high performance of hardware synchronicity, including several high-resolution cameras, headset and directional microphones, biophysiology, 3D data as well as skeleton and face detection information. In the corpus, 65 subjects are recorded in the interaction sessions for a total of 100 minutes per subject, including self-ratings from eight time points during the experiment. Additionally, several questionnaires are available from all subjects, regarding personality traits, including technical and stress coping behavior.},
  keywords  = {conferenceproc,conference}
}

@ARTICLE{Lotz:DuePiblico2016, author = {Lotz, Alicia Flores and Siegert, Ingo and Wendemuth, Andreas}, Title = {Comparison of Different Modeling Techniques for Robust Prototype Matching of Speech Pitch-Contours}, JOURNAL = {Kognitive Systeme}, YEAR = {2016}, ISSN = {2197-0343}, PUBLISHER = {Universitätsbibliothek Duisburg-Essen}, Volume = {1}, abstract = {The goal of this investigation is to enable an automatic classification for the discourse particle (DP) “hm”. As stated in the introduction, in human communication the interlocutors are able to differentiate the functional meaning of DPs only by their intonation. The intonational curve can be represented using the pitch-values extracted from the raw speech material. This estimation of the pitch is a separate field of research and has three major difficulties, due to continuous speech and the short duration of the DPs: 1) the beginning and end of the DP can interfere with other spoken content, 2) gaps within the acoustical course, and 3) wrongly estimated pitch-values. To process these errors and reduce the complexity of the following analysis algorithm a pre-processing of the pitch-contour is necessary. The pre-processing of the pitch-contour will reduce the number of pitch-values. Being one of the smallest utterance units, the number of measuring points for “hm” is already small which leads to the main issue of this paper: The development of a method which ensures a reliable and robust classification of the DP “hm”. Therefore, three different methods will be presented and compared to evaluate the best one. Furthermore, to ensure the reliability of the algorithm not only for human-computer interaction (HCI), and to gain more training data, tests were carried out on two datasets containing HCI and human-human interaction (HHI).}, keywords={zeitung} }

@Inproceedings{SiegertESSV:2016Duong, Title = {Measuring the impact of audio compression on the spectral quality of speech data}, Address = {Leipzig, Germany}, Author = {Siegert, Ingo and Lotz, Alicia Flores and Duong, Linh Linda and Wendemuth, Andreas}, Booktitle = {Elektronische Sprachsignalverarbeitung 2016. Tagungsband der 27. Konferenz}, Pages = {229--236}, Year = {2016}, series = {Studientexte zur Sprachkommunikation}, editor = {Jokisch, Oliver}, publisher = {TUDpress}, volume = {81}, abstract = {Due to the grown use of speech technology, the need for efficient data storage becomes increasingly important. In this paper, we investigate whether well-known audio-codecs for music data can be used to store speech data without introducing too much spectral errors. Our investigations are concluded with the recommendation to use Ogg Vorbis in its highest quality setting for data storage. }, keywords={kongressbook,kongress} }

@Inproceedings{SiegertESSV:2016DS3, Title = {Multimodal Information Processing: The Ticket Purchase - a Demonstration Scenario of the SFB/TRR-62}, Address = {Leipzig, Germany}, Author = {Siegert, Ingo and Reuter, Stephan and Schüssel, Felix and Layher, Georg and Hörnle, Thilo and Meudt, Sascha and Wendemuth, Andreas}, Booktitle = {Elektronische Sprachsignalverarbeitung 2016. Tagungsband der 27. Konferenz}, Pages = {111--118}, Year = {2016}, series = {Studientexte zur Sprachkommunikation}, editor = {Jokisch, Oliver}, publisher = {TUDpress}, volume = {81}, abstract = {The demonstration scenario of the SFB/TRR-62 shows multimodal, dynamic interactions between a human being and a technical system that are adaptive to the situation and the emotional state. It uses the example of purchasing a train ticket to demonstrate how a companion system is able to adapt its dialog with the user according to the situational context and the emotions of the user. One special feature of this scenario are the simultaneous analyses and evaluations of explicit and implicit input data.

The scenario demonstrates further how background knowledge about the user can be included; for example often visited destinations, the user's timetable or the number of travelers. }, keywords={kongressbook,kongress} }

@article{SiegertKI2015, year={2016}, issn={0933-1875}, journal={KI - Künstliche Intelligenz}, doi={10.1007/s13218-015-0394-8}, title={Emotional and User-Specific Acoustic Cues for Improved Analysis of Naturalistic Interactions}, note={Doctoral and Postdoctoral Dissertations}, url={http://dx.doi.org/10.1007/s13218-015-0394-8}, publisher={Springer Berlin Heidelberg}, author={Siegert, Ingo}, pages={93--94}, volume={30}, number={1}, abstract={A companion system is distinguished from today's technical systems by the fact that it addresses individual needs and abilities of a user by adapting towards him. Furthermore a companion system allows a naturalistic human-computer interaction. The dissertation discussed in this article investigates improvements for natural speech-based interaction in the example of four open issues. These issues should be resolved to allow future systems to act like a companion system.

As first open issue, methodological improvements for the annotation of emotions as well as studies of inter-rater reliability to evaluate emotions are examined. The second open issue analyses how speaker-group-dependent models can improve the emotion recognition. This allows companion systems to respond in an individual way to a user, although no adaptation for the specific user has been conducted. The third open issue deals with so-called discourse particles and analyses if they have the same functional meanings in human-computer interaction as in human to human communication. Furthermore, it is investigated if this meaning could be distinguished by the pitch contour. In the fourth open issue, the user's long-term emotional development is modelled by using a physically inspired spring model. Additionally, the integration of the user's personality trait "extraversion" is discussed.

The incorporation of the methods discussed in relation with the investigated four open issues allow companion systems to behave more like a human interlocutor. The discussed thesis especially points towards possibilities to implement an adaptive and individual companion system.}, keywords={zeitung} }

@incollection{LotzHCII2016, Title = {Classification of Functional-Meanings of Non-isolated Discourse Particles in Human-Human-Interaction}, Author = {Lotz, Alicia Flores and Siegert, Ingo and Wendemuth, Andreas}, Booktitle = {Human-Computer Interaction. Theory, Design, Development and Practice}, Pages = {53--64}, Year = {2016}, series = {Lecture Notes in Computer Science}, editor = {Masaaki Kurosu}, publisher = {Springer}, volume = {9731}, doi = {10.1007/978-3-319-39510-4_6}, abstract = {To enable a natural interaction with future technical systems, not only the meaning of the pure spoken text, but also meta-information such as attention or turn-taking has to be perceived and processed. This further information is effectively transmitted by semantic and prosodic cues, without interrupting the speaker. For the German language we rely on previous empirically discovered seven types of form-function-concurrences on the isolated discourse particle (DP) “hm”.

In this paper we present an improved automatic classification-method towards non-isolated DPs in human-human interaction (HHI). We show that classifiers trained on (HCI)-data can be used to robustly evaluate the contours of DPs in both HCI and HHI by performing a classifier adaptation to HHI data. We also discuss the problem of the pitch-contour extraction due to the unvoiced “hm”-utterances, leading to gaps and/or jumps in the signal and hence to confusions in form-type classifications. This can be alleviated by our investigation of contours with high extraction completion grade. We also show that for the acoustical evaluation of the functional-meaning, the idealized form-function prototypes by Schmidt are not suitable in case of naturalistic HHI. However, the precision of acoustical-meaning prediction with our classifier remains high. }, keywords={conferencebook,conference} }

@incollection{SiegertHCII2016, Title = {Discourse Particles in Human-Human and Human-Computer Interaction – Analysis and Evaluation}, Author = {Siegert, Ingo and Krüger, Julia and Haase,Matthias and Lotz,Alicia Flores and Günther,Stephan and Frommer, Jörg and Rösner,Dietmar and Wendemuth,Andreas}, Booktitle = {Human-Computer Interaction. Theory, Design, Development and Practice}, Pages = {105--117}, Year = {2016}, series = {Lecture Notes in Computer Science}, editor = {Masaaki Kurosu}, publisher = {Springer}, volume = {9731}, doi = {10.1007/978-3-319-39510-4_11}, abstract = {Discourse particles are verifiably used in both human-human interaction (HHI) and human-computer interaction (HCI). In both types of interaction form-function-relations could be confirmed. Also correlations with specific subject characteristics, personality traits and the use of these particles could be uncovered. But these investigations are performed on separated datasets containing either HHI or HCI. Moreover, the subjects analyzed in both interaction types are not the same and thus, direct connections could not be made.

In our contribution, we report about analyses of discourse particles in both HHI and HCI with the same subjects. This enables us to draw conclusions of the communication partner’s influence in relation to subject characteristics and personality traits. This will prospectively help to better understand the use of discourse particles. By using this knowledge, future technical systems can react to known subjects more individually. }, keywords={conferencebook,conference} }

@Inproceedings{SiegertITG2016, Title = {Emotion Intelligibility within Codec-Compressed and Reduced Bandwidth Speech}, Author = {Siegert, Ingo and Lotz, Alicia Flores and Maruschke, Michael and Jokisch, Oliver and Wendemuth, Andreas}, Booktitle = {ITG-Fb. 267: Speech Communication : 12. ITG-Fachtagung Sprachkommunikation}, Pages = {215--219}, month = oct, address = {Paderborn, Germany}, Year = {2016}, publisher = {VDE Verlag}, keywords={kongressbook,kongress} }

@Inproceedings{IntroERM4CT, Title = {ERM4CT 2016: 2nd international workshop on emotion representations and modelling for companion systems (workshop summary)}, Author = {Hartmann, Kim and Siegert, Ingo and Salah, Ali Albert and Truong, Khiet P. }, Booktitle = {Proceedings of the 18th ACM International Conference on Multimodal Interaction}, Pages = {593--595}, Year = {2016}, publisher = {ACM}, doi = {10.1145/2993148.3007630}, keywords={other} }

@proceedings{Hartmann:2016, title = {ERM4CT '16: Proceedings of the 2nd International Workshop on Emotion Representations and Modelling for Companion Technologies}, author = {Hartmann, Kim and Siegert, Ingo and Salah, Ali Albert and Truong, Khiet P. }, year = {2016}, publisher = {ACM}, address = {New York, NY, USA}, keywords={heraus} }

@Inproceedings{ThiersSmartASSIST, Title = {Kennzeichnung von Nutzerprofilen zur Interaktionssteuerung beim Gehen}, Author = {Thiers, Angelina and Hamacher, Dennis and Tornow, Michael and Heinemann, Ralph and Siegert, Ingo and Wendemuth, Andreas and Schega, Lutz}, Booktitle = {Zweite transdisziplinäre Konferenz. Technische Unterstützungssysteme, die die Menschen wirklich wollen}, Pages = {475--484}, month = dec, address = {Hamburg, Germany}, Year = {2016}, keywords={kongressbook,kongress} }

@Inproceedings{SiegertSmartASSIST, Title = {Akustische Marker für eine verbesserte Situations- und Intentionserkennung von technischen Assistenzsystemen}, Author = {Siegert, Ingo and Lotz, Alicia Flores and Egorow, Olga and Böck, Ronald and Schega, Lutz and Tornow, Michael and Thiers, Angelina and Wendemuth, Andreas}, Booktitle = {Zweite transdisziplinäre Konferenz. Technische Unterstützungssysteme, die die Menschen wirklich wollen}, Pages = {465--474}, month = dec, address = {Hamburg, Germany}, Year = {2016}, keywords={kongressbook,kongress} }

@INPROCEEDINGS{8287079, author={Egorow, Olga and Lotz, Alicia and Siegert, Ingo and Böck, Ronald and Krüger, Julia and Wendemuth, Andreas}, booktitle={2017 International Conference on Companion Technology (ICCT)}, title={Accelerating manual annotation of filled pauses by automatic pre-selection}, year={2017}, pages={1--6}, keywords={conferencebook,conference}, doi={10.1109/COMPANION.2017.8287079}, month=sep }

@Inbook{CompanionBuchkap4, author={Gossen, Tatiana and Siegert, Ingo and Nürnberger, Andreas and Hartmann, Kim and Kotzyba, Michael and Wendemuth, Andreas }, title={Modeling aspects in human-computer interaction - adaptivity, user characteristics and evaluation}, bookTitle={Companion technology: a paradigm shift in human-technology interaction}, year={2017}, publisher={Springer International Publishing}, address={Cham}, pages={57--78}, isbn={978-3-319-43665-4}, url={https://doi.org/10.1007/978-3-319-43665-4_4}, keywords={buch} }

@Inbook{CompanionBuchkap14, author={Rösner, Dietmar and Frommer, Jörg and Wendemuth, Andreas and Bauer, Thomas and Günther, Stephan and Haase, Matthias and Siegert, Ingo }, title={The last minute corpus as a research resource - from signal processing to behavioral analyses in user-companion interactions}, bookTitle={Companion technology: a paradigm shift in human-technology interaction}, year={2017}, publisher={Springer International Publishing}, address={Cham}, pages={277--299}, isbn={978-3-319-43665-4}, url={https://doi.org/10.1007/978-3-319-43665-4_14}, keywords={buch} }

@Inbook{CompanionBuchkap19, author={Schwenker, Friedhelm and Böck, Ronald and Schels, Martin and Meudt, Sascha and Siegert, Ingo and Glodek, Michael and Kächele, Markus and Schmidt-Wack, Miriam and Thiam, Patrick and Wendemuth, Andreas and Krell, Gerald}, title={Multimodal affect recognition in the context of human-computer interaction for companion-systems}, bookTitle={Companion technology: a paradigm shift in human-technology interaction}, year={2017}, publisher={Springer International Publishing}, address={Cham}, pages={378--408}, isbn={978-3-319-43665-4}, url={https://doi.org/10.1007/978-3-319-43665-4_19}, keywords={buch} }

@Inbook{CompanionBuchkap20, author={Wendemuth, Andreas and Vlasenko, Bogdan and Siegert, Ingo and Böck, Ronald and Schwenker, Friedhelm and Palm, Günther }, title={Emotion recognition from speech}, bookTitle={Companion technology: a paradigm shift in human-technology interaction}, year={2017}, publisher={Springer International Publishing}, address={Cham}, pages={409--428}, isbn={978-3-319-43665-4}, url={https://doi.org/10.1007/978-3-319-43665-4_20}, keywords={buch} }

@Inbook{CompanionBuchkap25, author={Siegert, Ingo and Schüssel, Felix and Schmidt, Miriam and Reuter, Stephan and Meudt, Sascha and Layher, Georg and Krell, Gerald and Hörnle, Thilo and Handrich, Sebastian and Al-Hamadi, Ayoub and Dietmayer, Klaus and Neumann, Heiko and Palm, Günther and Schwenker, Friedhelm and Wendemuth, Andreas }, title={Multi-modal information processing in companion-systems - a ticket purchase system}, bookTitle={Companion technology: a paradigm shift in human-technology interaction}, year={2017}, publisher={Springer International Publishing}, address={Cham}, pages={493--500}, isbn={978-3-319-43665-4}, url={https://doi.org/10.1007/978-3-319-43665-4_25}, keywords={buch} }

@Inbook{Boeck2017, author={Böck, Ronald and Egorow, Olga and Siegert, Ingo and Wendemuth, Andreas}, editor={Horain, Patrick and Achard, Catherine and Mallem, Malik}, title={Comparative Study on Normalisation in Emotion Recognition from Speech}, bookTitle={Proceedings of the 9th International Conference on Intelligent Human Computer Interaction (IHCI 2017)}, year={2017}, publisher={Springer International Publishing}, address={Cham}, pages={189--201}, abstract={The recognition performance of a classifier is affected by various aspects. A huge influence is given by the input data pre-processing. In the current paper we analysed the relation between different normalisation methods for emotionally coloured speech samples deriving general trends to be considered during data pre-processing. From the best of our knowledge, various normalisation approaches are used in the spoken affect recognition community but so far no multi-corpus comparison was conducted. Therefore, well-known methods from literature were compared in a larger study based on nine benchmark corpora, where within each data set a leave-one-speaker-out validation strategy was applied. As normalisation approaches, we investigated standardisation, range normalisation, and centering. These were tested in two possible options: (1) The normalisation parameters were estimated on the whole data set and (2) we obtained the parameters by using emotionally neutral samples only. For classification Support Vector Machines with linear and polynomial kernels as well as Random Forest were used as representatives of classifiers handling input material in different ways. Besides further recommendations we showed that standardisation leads to a significant improvement of the recognition performance. It is also discussed when and how to apply normalisation methods.}, isbn={978-3-319-72038-8}, doi={10.1007/978-3-319-72038-8_15}, url={https://doi.org/10.1007/978-3-319-72038-8_15}, keywords={conferencebook,conference} }

@ARTICLE{Egorow:DuePiblico2017, author = {Egorow, Olga and Siegert, Ingo and Wendemuth, Andreas}, Title = {Prediction of user satisfaction in naturalistic human-computer interaction}, Journal = {Kognitive Systeme}, year = {2017}, number = {1}, issn = {2197-0343}, publisher = {Universitätsbibliothek Duisburg-Essen}, keywords={zeitung} }

@Inbook{SiegertSpecomHftl2017, author={Siegert, Ingo and Lotz, Alicia Flores and Egorow, Olga and Wendemuth, Andreas}, editor={Karpov, Alexey and Potapova, Rodmonga and Mporas, Iosif}, title={Improving Speech-Based Emotion Recognition by Using Psychoacoustic Modeling and Analysis-by-Synthesis}, bookTitle={Proceedings of SPECOM 2017, 19th International Conference Speech and Computer}, year={2017}, publisher={Springer International Publishing}, address={Cham}, pages={445--455}, abstract={Most technical communication systems use speech compression codecs to save transmission bandwidth. A lot of development was made to guarantee a high speech intelligibility resulting in different compression techniques: Analysis-by-Synthesis, psychoacoustic modeling and a hybrid mode of both. Our first assumption is that the hybrid mode improves the speech intelligibility. But, enabling a natural spoken conversation also requires affective, namely emotional, information, contained in spoken language, to be intelligibly transmitted. Usually, compression methods are avoided for emotion recognition problems, as it is feared that compression degrades the acoustic characteristics needed for an accurate recognition [1]. By contrast, in our second assumption we state that the combination of psychoacoustic modeling and Analysis-by-Synthesis codecs could actually improve speech-based emotion recognition by removing certain parts of the acoustic signal that are considered ``unnecessary'', while still containing the full emotional information. To test both assumptions, we conducted an ITU-recommended POLQA measuring as well as several emotion recognition experiments employing two different datasets to verify the generality of this assumption. We compared our results on the hybrid mode with Analysis-by-Synthesis-only and psychoacoustic modeling-only codecs. 
The hybrid mode does not show remarkable differences regarding the speech intelligibility, but it outperforms all other compression settings in the multi-class emotion recognition experiments and achieves even an {\$}{\$}{\backslash}sim {\$}{\$} 3.3{\%} absolute higher performance than the uncompressed samples.}, isbn={978-3-319-66429-3}, doi={10.1007/978-3-319-66429-3_44}, url={https://doi.org/10.1007/978-3-319-66429-3_44}, keywords={conferencebook,conference} }

@Inbook{SiegertSpecom2017, author={Siegert, Ingo and Jokisch, Oliver and Lotz, Alicia Flores and Trojahn, Franziska and Meszaros, Martin and Maruschke, Michael}, editor={Karpov, Alexey and Potapova, Rodmonga and Mporas, Iosif}, title={Acoustic Cues for the Perceptual Assessment of Surround Sound}, bookTitle={Proceedings of SPECOM 2017, 19th International Conference Speech and Computer}, year={2017}, publisher={Springer International Publishing}, address={Cham}, pages={65--75}, abstract={Speech and audio codecs are implemented in a variety of multimedia applications, and multichannel sound is offered by first streaming or cloud-based services. Beside the objective of perceptual quality, coding-related research is focused on low bitrate and minimal latency. The IETF-standardized Opus codec provides a high perceptual quality, low latency and the capability of coding multiple channels in various audio bandwidths up to Fullband (20 kHz). In a previous perceptual study on Opus-processed 5.1 surround sound, uncompressed and degraded stimuli were rated on a five-point degradation category scale (DMOS) for six channels at total bitrates between 96 and 192 kbit/s. This study revealed that the perceived quality depends on the music characteristics. In the current study we analyze spectral and music-feature differences between those five music stimuli at three coding bitrates and uncompressed sound to identify objective causes for perceptual differences. The results show that samples with annoying audible degradations involve higher spectral differences within the LFE channel as well as highly uncorrelated LSPs.}, isbn={978-3-319-66429-3}, doi={10.1007/978-3-319-66429-3_6}, url={https://doi.org/10.1007/978-3-319-66429-3_6}, keywords={conferencebook,conference} }

@Inproceedings{LotzESSV:2017, Title = {Audio Compression and its Impact on Emotion Recognition in Affective Computing}, Address = {Saarbrücken, Germany}, Author = {Lotz, Alicia Flores and Siegert, Ingo and Maruschke, Michael and Wendemuth, Andreas}, Booktitle = {Elektronische Sprachsignalverarbeitung 2017. Tagungsband der 28. Konferenz}, Pages = {1--8}, Year = {2017}, series = {Studientexte zur Sprachkommunikation}, editor = {Trouvain, Jürgen and Steiner, Ingmar and Möbius, Bernd}, publisher = {TUDpress}, volume = {86}, abstract = {Enabling a natural (human-like) spoken conversation with technical systems requires affective information, contained in spoken language, to be intelligibly transmitted. This study investigates the role of speech and music codecs for affect intelligibility. A decoding and encoding of affective speech was employed from the well-known EMO-DB corpus. Using four state-of-the-art acoustic codecs and different bit-rates, the spectral error and the human affect recognition ability in labeling experiments were investigated and set in relation to results of automatic recognition of base emotions. Through this approach, the general affect intelligibility as well as the emotion specific intelligibility was analyzed. Considering the results of the conducted automatic recognition experiments, the SPEEX codec configuration with a bit-rate of 6.6 kbit/s is recommended to achieve a high compression and overall good UARs for all emotions.}, keywords={kongressbook,kongress} }

@Inproceedings{SiegertESSV:2017, Title = {ikannotate2 – A Tool Supporting Annotation of Emotions in Audio-Visual Data}, Address = {Saarbrücken, Germany}, Author = {Siegert, Ingo and Wendemuth, Andreas}, Booktitle = {Elektronische Sprachsignalverarbeitung 2017. Tagungsband der 28. Konferenz}, Pages = {17--24}, Year = {2017}, series = {Studientexte zur Sprachkommunikation}, editor = {Trouvain, Jürgen and Steiner, Ingmar and Möbius, Bernd}, publisher = {TUDpress}, volume = {86}, abstract = {For emotional analyses of interactions, qualitatively high transcription and annotation of given material is important. The textual transcription can be conducted with several available tools, like e.g. Folker or ANVIL. But tools for the annotation of emotions are quite rare. Furthermore, existing tools only allow to select an emotion term from a list of terms. Thus, a relation between the different emotional terms that has been uncovered by psychologists get lost. In this paper, we present an enhanced version of the tool ikannotate that is able to add an emotional annotation onto already transcribed material. This tool relies on established emotion labelling methods, like the Geneva Emotion Wheel or the Self Assessment Manikins to maintain the relationship. Furthermore, the annotator is guided by a step-wise process to improve the reliability of the emotional annotation. Additionally, the uncertainty in assessing emotions can be covered as well, to evaluate the labels afterwards and exclude samples with too low uncertainty from further analyses. The tool ikannotate2 can be used under Windows, Linux and macOS. All settings can be changed via corresponding INI-files. }, keywords={kongressbook,kongress} }

@INPROCEEDINGS{8584339, author={{Trujillo}, Michael Olmos and {Adamatti}, Diana F. and {Siegert}, Ingo}, booktitle={2018 Congreso Argentino de Ciencias de la Informática y Desarrollos de Investigación (CACIDI)}, title={Using Category Theory to Structure the OCC Theory of emotions}, year={2018}, pages={1-6}, keywords={conferenceproc,conference} }

@Inbook{SiegertSpecom2018, author={Siegert, Ingo and Lotz, Alicia Flores and Egorow, Olga and Wolff, Susann}, editor={Karpov, Alexey and Jokisch, Oliver and Potapova, Rodmonga}, title={Utilizing psychoacoustic modeling to improve speech-based emotion recognition}, bookTitle={Proceedings of SPECOM 2018, 20th International Conference Speech and Computer}, year={2018}, publisher={Springer International Publishing}, address={Cham}, pages={625--635}, keywords={conferencebook,conference} }

@Inbook{EgorowSpecom2018, author={Egorow, Olga and Siegert, Ingo and Wendemuth, Andreas }, editor={Karpov, Alexey and Jokisch, Oliver and Potapova, Rodmonga}, title={Improving emotion recognition performance by random-forest-based feature selection}, bookTitle={Proceedings of SPECOM 2018, 20th International Conference Speech and Computer}, year={2018}, publisher={Springer International Publishing}, address={Cham}, pages={134--144}, keywords={conferencebook,conference} }

@ARTICLE{KogSy3, author = {Lotz, Alicia Flores and Wilbrink, Marc and Siegert, Ingo and Jipp, Meike and Wendemuth, Andreas}, Title = {An experimental paradigm for inducing emotions in a real world driving scenario evidence from self-report, annotation of speech data and peripheral physiology}, JOURNAL = {Kognitive Systeme}, YEAR = {2018}, ISSN = {2197-0343}, PUBLISHER = {Universitätsbibliothek Duisburg-Essen}, number = {1}, keywords={zeitung} }

@ARTICLE{KogSy2, author = {Siegert, Ingo and Krüger, Julia}, Title = {How do we speak with {ALEXA} - subjective and objective assessments of changes in speaking style between {HC} and {HH} conversations}, JOURNAL = {Kognitive Systeme}, YEAR = {2018}, ISSN = {2197-0343}, PUBLISHER = {Universitätsbibliothek Duisburg-Essen}, number = {1}, keywords={zeitung} }

@Inproceedings{KogSy1, author = {Höbel-Müller, Juliane and Siegert, Ingo and Weißkirchen, Norman and Wendemuth, Andreas}, Title = {Evaluation of an {I-Vector} Based Elderly Identification System for Ambient Intelligence}, booktitle = {7. Workshop Kognitive Systeme: Mensch, Teams, Systeme und Automaten}, YEAR = {2018}, address = {Braunschweig}, note={akzeptiert}, keywords={kongressproc,kongress} }

@article{SIEGERT20181, title = {Using a PCA-based dataset similarity measure to improve cross-corpus emotion recognition}, journal = {Computer Speech \& Language}, volume = {51}, pages = {1--23}, year = {2018}, issn = {0885-2308}, doi = {10.1016/j.csl.2018.02.002}, url = {http://www.sciencedirect.com/science/article/pii/S0885230816302650}, author = {Siegert, Ingo and Böck, Ronald and Wendemuth, Andreas}, keywords = {zeitung,PCA, Dataset similarity, Cross-corpus emotion recognition, Automatic similarity scoring}, abstract = {In emotion recognition from speech, huge amounts of training material are needed for the development of classification engines. As most current corpora do not supply enough material, a combination of different datasets is advisable. Unfortunately, data recording is done differently and various emotion elicitation and emotion annotation methods are used. Therefore, a combination of corpora is usually not possible without further effort. The manuscript’s aim is to answer the question which corpora are similar enough to jointly be used as training material. A corpus similarity measure based on PCA-ranked features is presented and similar datasets are identified. To evaluate our method we used nine well-known benchmark corpora and automatically identified a sub-set of six most similar datasets. To test that the identified most similar six datasets influence the classification performance, we conducted several cross-corpora emotion recognition experiments comparing our identified six most similar datasets with other combinations. Our most similar sub-set outperforms all other combinations of corpora, the combination of all nine datasets as well as feature normalization techniques. Also influencing side-effects on the recognition rate were excluded. Finally, the predictive power of our measure is shown: increasing similarity score, expressing decreasing similarity, result in decreasing recognition rates. 
Thus, our similarity measure answers the question which corpora should be included into joint training.} }

@Inproceedings{SiegertMMC:2018, author = {Siegert, Ingo and Krüger, Julia and Egorow, Olga and Nietzold, Jannik and Heinemann, Ralph and Lotz, Alicia Flores}, title = {Voice Assistant Conversation Corpus (VACC): A Multi-Scenario Dataset for Addressee Detection in Human-Computer-Interaction using Amazon's ALEXA}, booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}, year = {2018}, month = may, date = {7-12}, location = {Miyazaki, Japan}, editor = {Hanae Koiso and Patrizia Paggio}, publisher = {European Language Resources Association (ELRA)}, address = {Paris, France}, isbn = {979-10-95546-16-0}, language = {english}, keywords={conferenceproc,conference} }

@Inproceedings{LotzESSV:2018, Title = {Emotion Recognition from Disturbed Speech -- towards Affective Computing in Real-World In-Car Environment}, Address = {Ulm, Germany}, Author = {Lotz, Alicia Flores and Faller, Fabian and Siegert, Ingo and Wendemuth, Andreas}, Booktitle = {Elektronische Sprachsignalverarbeitung 2018. Tagungsband der 29. Konferenz}, Pages = {208--215}, Year = {2018}, series = {Studientexte zur Sprachkommunikation}, editor = {Berton, Andre and Haiber, Udo and Minker, Wolfgang}, publisher = {TUDpress}, volume = {90}, keywords={kongressbook,kongress} }

@Inproceedings{SiegertESSV:2018, Title = {Acoustic Addressee-Detection -- Analysing the Impact of Age, Gender and Technical Knowledge}, Address = {Ulm, Germany}, Author = {Siegert, Ingo and Tang, Shuran and Lotz, Alicia Flores}, Booktitle = {Elektronische Sprachsignalverarbeitung 2018. Tagungsband der 29. Konferenz}, Pages = {113--120}, Year = {2018}, series = {Studientexte zur Sprachkommunikation}, editor = {Berton, Andre and Haiber, Udo and Minker, Wolfgang}, publisher = {TUDpress}, volume = {90}, keywords={kongressbook,kongress} }

@inproceedings{Raveh2019ThreesAC, title={Three’s a Crowd? Effects of a Second Human on Vocal Accommodation with a Voice Assistant}, author={Raveh, Eran and Siegert, Ingo and Steiner, Ingmar and Gessinger, Iona and M{\"o}bius, Bernd}, booktitle={INTERSPEECH 2019}, year={2019}, pages= {4005--4009}, keywords={conferenceproc,conference} }

@Inbook{10.1007/978-3-030-26061-3_19, author="Jokisch, Oliver and Siegert, Ingo and Maruschke, Michael and Strutz, Tilo and Ronzhin, Andrey", editor="Salah, Albert Ali and Karpov, Alexey and Potapova, Rodmonga", title="Don't Talk to Noisy Drones -- Acoustic Interaction with Unmanned Aerial Vehicles", booktitle="Speech and Computer", year="2019", publisher="Springer International Publishing", address="Cham", pages="180--190", abstract="Common applications of an unmanned aerial vehicle (UAV, aerial drone) utilize the capabilities of mobile image or video capturing, whereas our article deals with acoustic-related scenarios. Especially for surveillance tasks, e.g. in disaster management or measurement of artificial environmental noise in large industrial areas, an UAV-based acoustic interaction or measurement can be important tasks. A sound and speech signal processing at UAVs is complex because of rotor and maneuver-related noise components. The signal processing has to consider various sound sources, and the wanted signals (e.g. artificial environmental noise or speech signals) have to be separated from the UAVs' own flight and wind noise. The contribution discusses the acoustic scenarios and some acoustic characteristics of a sample UAV, including the effect of flight maneuvers. We recorded speech signals in best practice with regard to the outcome of our preliminary analyses and then conducted objective speech quality measurements and speech recognition experiments with a state-of-the-art recognizer. Aside, the measurability of environmental noise signals is analyzed exemplarily. The article concludes with lessons learned for acoustic UAV interactions or measurements and preliminary thoughts with regard to a novel category of 'low-noise' UAVs.", isbn="978-3-030-26061-3", keywords={conferenceproc,conference} }

@inproceedings{akhtiamov-etal-2019-cross, title = "Cross-Corpus Data Augmentation for Acoustic Addressee Detection", author = "Akhtiamov, Oleg and Siegert, Ingo and Karpov, Alexey and Minker, Wolfgang", booktitle = "Proceedings of the 20th Annual SIGdial Meeting on Discourse and Dialogue", month = sep, year = "2019", address = "Stockholm, Sweden", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/W19-5933", doi = "10.18653/v1/W19-5933", pages = "274--283", abstract = "Acoustic addressee detection (AD) is a modern paralinguistic and dialogue challenge that especially arises in voice assistants. In the present study, we distinguish addressees in two settings (a conversation between several people and a spoken dialogue system, and a conversation between several adults and a child) and introduce the first competitive baseline (unweighted average recall equals 0.891) for the Voice Assistant Conversation Corpus that models the first setting. We jointly solve both classification problems, using three models: a linear support vector machine dealing with acoustic functionals and two neural networks utilising raw waveforms alongside with acoustic low-level descriptors. We investigate how different corpora influence each other, applying the mixup approach to data augmentation. We also study the influence of various acoustic context lengths on AD. Two-second speech fragments turn out to be sufficient for reliable AD. Mixup is shown to be beneficial for merging acoustic data (extracted features but not raw waveforms) from different domains that allows us to reach a higher classification performance on human-machine AD and also for training a multipurpose neural network that is capable of solving both human-machine and adult-child AD problems.", keywords={conferencebook,conference} }

@Inproceedings{SiegertESSV:2019, Title = {The Restaurant Booking Corpus -- content-identical comparative human-human and human-computer simulated telephone conversations}, Address = {Dresden, Germany}, Author = {Siegert, Ingo and Nietzold, Jannik and Heinemann, Ralph and Wendemuth, Andreas}, Booktitle = {Elektronische Sprachsignalverarbeitung 2019. Tagungsband der 30. Konferenz}, Pages = {126--133}, Year = {2019}, series = {Studientexte zur Sprachkommunikation}, publisher = {TUDpress}, volume = {93}, keywords={kongressbook,kongress} }

@Inproceedings{RavehESSV2019, Title = {Comparing phonetic changes in computer-directed and human-directed speech}, Address = {Dresden, Germany}, Author = {Raveh, Eran and Steiner, Ingmar and Siegert, Ingo and Gessinger, Iona and M{\"o}bius, Bernd}, Booktitle = {Elektronische Sprachsignalverarbeitung 2019. Tagungsband der 30. Konferenz}, Pages = {42--49}, Year = {2019}, series = {Studientexte zur Sprachkommunikation}, publisher = {TUDpress}, volume = {93}, keywords={kongressbook,kongress} }

@Inbook{BoeckAnticipating, author={Böck, Ronald and Egorow, Olga and Höbel-Müller, Juliane and Requardt, Alicia Flores and Siegert, Ingo and Wendemuth, Andreas }, title={Anticipating the user -- acoustic disposition recognition in intelligent interactions}, bookTitle={Innovations in big data mining and embedded knowledge}, year={2019}, publisher={Springer International Publishing}, address={Cham}, pages={203--233}, keywords={buch} }

@Inproceedings{JHMDAGA:2019, Title = {Analysis of the influence of different room acoustics on acoustic emotion features and emotion recognition performance}, Address = {Rostock, Germany}, Author = {Höbel-Müller, Juliane and Siegert, Ingo and Heinemann, Ralph and Requardt, Alicia Flores and Tornow, Michael and Wendemuth, Andreas}, Booktitle = {Tagungsband - DAGA 2019}, Pages = {886--889}, Year = {2019}, keywords={kongressbook,kongress} }

@Inproceedings{JHMESSV:2019, Title = {Analysis of the influence of different room acoustics on acoustic emotion features}, Address = {Dresden, Germany}, Author = {Höbel-Müller, Juliane and Siegert, Ingo and Heinemann, Ralph and Requardt, Alicia Flores and Tornow, Michael and Wendemuth, Andreas }, Booktitle = {Elektronische Sprachsignalverarbeitung 2019. Tagungsband der 30. Konferenz}, Pages = {156--163}, Year = {2019}, series = {Studientexte zur Sprachkommunikation}, publisher = {TUDpress}, volume = {93}, keywords={kongressbook,kongress} }

@Inproceedings{KogSy2019, author = {Siegert, Ingo and Weißkirchen, Norman and Wendemuth, Andreas}, Title = {Admitting the addressee-detection faultiness to improve the performance using a continuous learning framework}, booktitle = {8. Workshop Kognitive Systeme: Mensch, Teams, Systeme und Automaten}, YEAR = {2019}, address = {Duisburg}, pages={38--39}, keywords={other} }

@INPROCEEDINGS{9287127, author={{Hailu}, Nirayo and {Siegert}, Ingo and {Nürnberger}, Andreas}, booktitle={2020 IEEE 22nd International Workshop on Multimedia Signal Processing (MMSP)}, title={Improving Automatic Speech Recognition Utilizing Audio-codecs for Data Augmentation}, year={2020}, pages={1--5}, doi={10.1109/MMSP48831.2020.9287127}, keywords={conferencebook,conference} }

@INPROCEEDINGS{9209538, author={{Weißkirchen}, Norman and Vasudeva Reddy, Mainampati and {Wendemuth}, Andreas and {Siegert}, Ingo}, booktitle={2020 IEEE International Conference on Human-Machine Systems (ICHMS)}, title={Utilizing Computer Vision Algorithms to Detect and Describe Local Features in Images for Emotion Recognition from Speech}, year={2020}, pages={1--6}, doi={10.1109/ICHMS49158.2020.9209538}, keywords={conferencebook,conference} }

@Inbook{10.1007/978-3-030-60276-5_50, author={Siegert, Ingo and Sinha, Yamini and Jokisch, Oliver and Wendemuth, Andreas}, editor={Karpov, Alexey and Potapova, Rodmonga}, title={Recognition Performance of Selected Speech Recognition APIs -- A Longitudinal Study}, booktitle={Speech and Computer}, year={2020}, publisher={Springer International Publishing}, address={Cham}, pages={520--529}, abstract={Within the last five years, the availability and usability of interactive voice assistants have grown. Thereby, the development benefits mostly from the rapidly increased cloud-based speech recognition systems. Furthermore many cloud-based services, such as Google Speech API, IBM Watson, and Wit.ai, can be used for personal applications and transcription tasks. As these tasks vary in their domain, their complexity as well as in their interlocutor, it is challenging to select a suitable cloud-based speech recognition service. As the update-process of online-services can be completely handled in the back-end, client applications do not need to be updated and thus improved accuracies can be expected within certain periods. This paper contributes to the field of automatic speech recognition, by comparing the performance of speech recognition between the above-mentioned cloud-based systems on German samples of high-qualitative spontaneous human-directed and device-directed speech as well as noisy device-directed speech over a period of eight months.}, isbn={978-3-030-60276-5}, keywords={conferencebook,conference} }

@INPROCEEDINGS{QUITDrones2020, author={Jokisch, Oliver and Siegert, Ingo}, booktitle={Quiet Drones: a Symposium on Noise from UASs/UAVs},
title={Advances in sound and speech signal processing at the presence of drones},
month = oct, year = {2020}, address = {Paris, France}, publisher = {INCEEUROPE}, keywords={conferencebook,conference} }

@proceedings{LEGAL2020, title = {Proceedings of the LREC 2020 Workshop on Legal and Ethical Issues in Human Language Technologies (LEGAL2020)}, editor = {Choukri, Khalid and Linden, Krister and Rigault, Mickael and Siegert, Ingo}, year = {2020}, publisher = {European Language Resources Association}, keywords={heraus} }

@INPROCEEDINGS{LREC2020WS, author={Siegert, Ingo and Silber-Varod, Vered and Kamocki, Pawel}, booktitle={Proceedings of the LREC 2020 Workshop on Legal and Ethical Issues in Human Language Technologies (LEGAL2020)},
title={GDPR - a game changer for acoustic interaction analyses},
month = may, year = {2020}, address = {Marseille, France}, publisher = {European Language Resources Association}, pages = {1--3}, keywords={conferencebook,conference} }

@INPROCEEDINGS{9211896, author={{Böhm}, Felix and {Siegert}, Ingo and {Belyaev}, Alexander and {Diedrich}, Christian}, booktitle={2020 25th IEEE International Conference on Emerging Technologies and Factory Automation (ETFA)}, title={An Analysis of the Applicability of VoiceXML as Basis for a Dialog Control Flow in Industrial Interaction Management}, year={2020}, pages={30--37}, doi={10.1109/ETFA46521.2020.9211896}, keywords={conferencebook,conference} }

@inproceedings{mci/Baumann2020, author = {Baumann, Timo and Siegert, Ingo}, title = {Prosodic addressee-detection: ensuring privacy in always-on spoken dialog systems}, booktitle = {Mensch und Computer 2020 - Tagungsband}, year = {2020}, editor = {Alt, Florian and Schneegass, Stefan and Hornecker, Eva}, pages = {195--198}, doi = {10.1145/3404983.3410021}, publisher = {ACM}, address = {New York}, keywords={conferencebook,conference} }

@article{SIEGERT20201, title = {Personal data protection and academia: GDPR issues and multi-modal data-collections ``in the wild''}, journal = {The Online Journal of Applied Knowledge Management: OJAKM}, volume = {8}, number = {1}, pages = {16--31}, year = {2020}, doi = {10.36965/OJAKM.2020.8(1)16-31}, author = {Siegert, Ingo and Silber-Varod, Vered and Carmi, Nehoray and Kamocki, Pawel}, keywords = {zeitung}, }

@InProceedings{siegert:2020:LREC, author = {Siegert, Ingo}, title = {“Alexa in the wild” – Collecting Unconstrained Conversations with a Modern Voice Assistant in a Public Environment}, booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference}, month = {May}, year = {2020}, address = {Marseille, France}, publisher = {European Language Resources Association}, pages = {608--612}, abstract = {Datasets featuring modern voice assistants such as Alexa, Siri, Cortana and others allow an easy study of human-machine interactions. But data collections offering an unconstrained, unscripted public interaction are quite rare. Many studies so far have focused on private usage, short pre-defined task or specific domains. This contribution presents a dataset providing a large amount of unconstrained public interactions with a voice assistant. Up to now around 40 hours of device directed utterances were collected during a science exhibition touring through Germany. The data recording was part of an exhibit that engages visitors to interact with a commercial voice assistant system (Amazon’s ALEXA), but did not restrict them to a specific topic. A specifically developed quiz was starting point of the conversation, as the voice assistant was presented to the visitors as a possible joker for the quiz. But the visitors were not forced to solve the quiz with the help of the voice assistant and thus many visitors had an open conversation. The provided dataset – Voice Assistant Conversations in the wild (VACW) – includes the transcripts of both visitors requests and Alexa answers, identified topics and sessions as well as acoustic characteristics automatically extractable from the visitors’ audio files.}, url = {https://www.aclweb.org/anthology/2020.lrec-1.76}, keywords={conferencebook,conference} }

@article{AkhtiamovMDPI:2020, title = {Using Complexity-Identical Human- and Machine-Directed Utterances to Investigate Addressee Detection for Spoken Dialogue Systems}, journal = {Sensors}, volume = {20}, number = {9}, pages = {2740}, year = {2020}, doi = {10.3390/s20092740}, url = {https://www.mdpi.com/1424-8220/20/9/2740}, author = {Akhtiamov, Oleg and Siegert, Ingo and Karpov, Alexey and Minker, Wolfgang}, keywords = {zeitung}, }

@Inproceedings{HoebelDaga:2020, author = {Höbel-Müller, Juliane and Siegert, Ingo and Gottschalk, Martin and Heinemann, Ralph and Wendemuth, Andreas}, title = {Investigation of the influence of standing waves on distant speech emotion recognition }, booktitle = {Fortschritte der Akustik - DAGA 2020 (50 Jahre DAGA) }, year = {2020}, pages = {822--825}, keywords={kongressbook,kongress} }

@Inproceedings{JokischDaga:2020, author = {Jokisch, Oliver and Lösch, Enrico and Siegert, Ingo}, title = {Speech Communication at the Presence of Unmanned Aerial Vehicles}, booktitle = {Fortschritte der Akustik - DAGA 2020 (50 Jahre DAGA) }, year = {2020}, pages = {952--955}, keywords={kongressbook,kongress} }

@Inproceedings{JKISITG:2020, author = {Krüger, Julia and Siegert, Ingo}, title = {das ist schon gruselig so dieses Belauschtwerden - subjektives Erleben von Interaktionen mit Sprachassistenzsystemen zum Zwecke der Individualisierung}, booktitle = {Sprachassistenten - Anwendungen, Implikationen, Entwicklungen : ITG-Workshop : Magdeburg, 3. März, 2020}, year = {2020}, pages = {29}, note = {Abstract}, keywords={other} }

@Inproceedings{KuzhipathalilITG:2020, author = {Kuzhipathalil, Adarsh and Thomas, Anto and Chand, Keerthana and Siegert, Ingo}, title = {Intelligent LSF-answering system -- an Alexa Skill}, booktitle = {Sprachassistenten - Anwendungen, Implikationen, Entwicklungen : ITG-Workshop : Magdeburg, 3. März, 2020}, year = {2020}, pages = {39}, note = {Abstract}, keywords={other} }

@proceedings{WendemuthESSV:2020, title={Elektronische Sprachsignalverarbeitung 2020 - Tagungsband der 31. Konferenz Magdeburg}, editor={Wendemuth, Andreas and Böck, Ronald and Siegert, Ingo}, year={2020}, publisher={TUDpress}, keywords={heraus} }

@proceedings{SiegertITG:2020, title = {Sprachassistenten - Anwendungen, Implikationen, Entwicklungen : ITG-Workshop : Magdeburg, 3. März, 2020}, editor = {Siegert, Ingo and Möller, Sebastian}, year = {2020}, doi = {10.25673/32572.2}, publisher = {OvGU}, keywords={heraus}, }

@Inbook{SiegertKrueger:2020, author={Siegert, Ingo and Krüger, Julia}, editor={Phillips-Wren, Gloria and Esposito, Anna and Jain, Lakhmi C.}, title={``Speech Melody and Speech Content Didn't Fit Together''---Differences in Speech Behavior for Device Directed and Human Directed Interactions}, bookTitle={Advances in Data Science: Methodologies and Applications}, year={2021}, publisher={Springer International Publishing}, address={Cham}, pages={65--95}, abstract={Nowadays, a diverse set of addressee detection methods is discussed. Typically, wake words are used. But these force an unnatural interaction and are error-prone, especially in case of false positive classification (user says the wake up word without intending to interact with the device). Therefore, technical systems should be enabled to perform a detection of device directed speech. In order to enrich research in the field of speech analysis in HCI we conducted studies with a commercial voice assistant, Amazon's ALEXA (Voice Assistant Conversation Corpus, VACC), and complemented objective speech analysis with subjective self and external reports on possible differences in speaking with the voice assistant compared to speaking with another person. The analysis revealed a set of specific features for device directed speech. It can be concluded that speech-based addressing of a technical system is a mainly conscious process including individual modifications of the speaking style.}, doi={10.1007/978-3-030-51870-7_4}, keywords={chapter,buch} }

@ARTICLE{10.3389/fcomm.2020.611555, author={Siegert, Ingo and Niebuhr, Oliver}, title={Case Report: Women, Be Aware that Your Vocal Charisma can Dwindle in Remote Meetings}, journal={Frontiers in Communication}, volume={5}, pages={135}, year={2021}, url={https://www.frontiersin.org/article/10.3389/fcomm.2020.611555}, doi={10.3389/fcomm.2020.611555},
issn={2297-900X},
abstract={Remote meetings via Zoom, Skype, or Teams limit the range and richness of nonverbal communication signals. Not just because of the typically sub-optimal light, posture, and gaze conditions, but also because of the reduced speaker visibility. Consequently, the speaker’s voice becomes immensely important, especially when it comes to being persuasive and conveying charismatic attributes. However, to offer a reliable service and limit the transmission bandwidth, remote meeting tools heavily rely on signal compression. It has never been analyzed how this compression affects a speaker’s persuasive and overall charismatic impact. Our study addresses this gap for the audio signal. A perception experiment was carried out in which listeners rated short stimulus utterances with systematically varied compression rates and techniques. The scalar ratings concerned a set of charismatic speaker attributes. Results show that the applied audio compression significantly influences the assessment of a speaker’s charismatic impact and that, particularly female speakers seem to be systematically disadvantaged by audio compression rates and techniques. Their charismatic impact decreases over a larger range of different codecs; and this decrease is additionally also more strongly pronounced than for male speakers. We discuss these findings with respect to two possible explanations. The first explanation is signal-based: audio compression codecs could be generally optimized for male speech and, thus, degrade female speech more (particularly in terms of charisma-associated features). Alternatively, the explanation is in the ears of the listeners who are less forgiving of signal degradation when rating female speakers’ charisma.}, keywords={zeitung} }

@Inproceedings{SiegertNiebuhrESSV:2021, Title = {Speech Signal Compression Deteriorates Acoustic Cues to Perceived Speaker Charisma}, Address = {Berlin, Germany}, Author = {Siegert, Ingo and Niebuhr, Oliver}, Booktitle = {Elektronische Sprachsignalverarbeitung 2021. Tagungsband der 32. Konferenz}, Pages = {1--10}, Year = {2021}, series = {Studientexte zur Sprachkommunikation}, publisher = {TUDpress}, volume = {99}, keywords={kongressbook,kongress} }

@Inproceedings{HarmonicESSV:2021, Title = {Audio and Video Processing of UAV-Based Signals -- The Harmonic Project}, Address = {Berlin, Germany}, Author = {Jokisch, Oliver and Strutz, Tilo and Leipnitz, Alexander and Siegert, Ingo and Ronzhin, Andrey}, Booktitle = {Elektronische Sprachsignalverarbeitung 2021. Tagungsband der 32. Konferenz}, Pages = {77--86}, Year = {2021}, series = {Studientexte zur Sprachkommunikation}, publisher = {TUDpress}, volume = {99}, keywords={kongressbook,kongress} }

@Inproceedings{SchmidtSiegertESSV:2021, Title = {Studie zur Lösbarkeit des Problems starker Pegelschwankungen im Home-Entertainment}, Address = {Berlin, Germany}, Author = {Schmidt, Georg and Siegert, Ingo}, Booktitle = {Elektronische Sprachsignalverarbeitung 2021. Tagungsband der 32. Konferenz}, Pages = {303--310}, Year = {2021}, series = {Studientexte zur Sprachkommunikation}, publisher = {TUDpress}, volume = {99}, keywords={kongressbook,kongress} }

@Inproceedings{SiegertESSV:2020, Title = {Does users' system evaluation influence speech behavior in HCI? - first insights from the engineering and psychological perspective}, Address = {Magdeburg, Germany}, Author = {Siegert, Ingo and Busch, Matthias and Krüger, Julia}, Booktitle = {Elektronische Sprachsignalverarbeitung 2020. Tagungsband der 31. Konferenz}, Pages = {241--248}, Year = {2020}, series = {Studientexte zur Sprachkommunikation}, publisher = {TUDpress}, volume = {95}, keywords={kongressbook,kongress} }

@Inproceedings{PetersenESSV:2020, Title = {Emergency Service - Sprachbasierte Klassifikation eingehender Anrufe in Ausnahmesituationen}, Address = {Magdeburg, Germany}, Author = {Petersen, Marcus and Niedrist, Karl-Heinz and Busch, Matthias and Marquardt, Florian and Siegert, Ingo }, Booktitle = {Elektronische Sprachsignalverarbeitung 2020. Tagungsband der 31. Konferenz}, Pages = {206--213}, Year = {2020}, series = {Studientexte zur Sprachkommunikation}, publisher = {TUDpress}, volume = {95}, keywords={kongressbook,kongress} }

@Inproceedings{GottschalkESSV:2020, Title = {Filtering-based analysis of spectral and temporal effects of room modes on low-level descriptors of emotionally coloured speech}, Address = {Magdeburg, Germany}, Author = {Gottschalk, Martin and Höbel, Juliane and Siegert, Ingo and Verhey, Jesko L. and Wendemuth, Andreas}, Booktitle = {Elektronische Sprachsignalverarbeitung 2020. Tagungsband der 31. Konferenz}, Pages = {219--226}, Year = {2020}, series = {Studientexte zur Sprachkommunikation}, publisher = {TUDpress}, volume = {95}, keywords={kongressbook,kongress} }

@Inproceedings{LoeschESSV:2020, Title = {Reduction of aircraft noise in UAV-based speech signal recordings by quantile based noise estimation}, Address = {Magdeburg, Germany}, Author = {Lösch, Enrico and Jokisch, Oliver and Leipnitz, Alexander and Siegert, Ingo }, Booktitle = {Elektronische Sprachsignalverarbeitung 2020. Tagungsband der 31. Konferenz}, Pages = {149--156}, Year = {2020}, series = {Studientexte zur Sprachkommunikation}, publisher = {TUDpress}, volume = {95}, keywords={kongressbook,kongress} }