Center for Robust Speech Systems

CRSS.bib

@inproceedings{CP-ICASSP15-CI-ImageGuideMap-HA_JN_RG_RL_BD_JH_ET-0005843,
  title = {Image-Guided Customization of Frequency-Place Mapping in Cochlear Implants},
  author = {H. Ali and J. Noble and R. Gifford and R. Labadie and B. Dawant and John H.L. Hansen and E. Tobey},
  booktitle = {Proc. IEEE ICASSP},
  year = {2015},
  file = {:CP-ICASSP15-CI-ImageGuideMap-HA_JN_RG_RL_BD_JH_ET-0005843.pdf:PDF},
  owner = {axm101521},
  pdf = {CP-ICASSP15-CI-ImageGuideMap-HA_JN_RG_RL_BD_JH_ET-0005843.pdf},
  timestamp = {2015.05.03}
}
@article{Boril2010,
  title = {Unsupervised Equalization of Lombard Effect for Speech Recognition in Noisy Adverse Environments},
  author = {Hynek Bo\v{r}il and John H. L. Hansen},
  journal = {IEEE Trans. Audio Speech Lang. Process.},
  year = {2010},
  month = {Aug.},
  number = {6},
  pages = {1379-1393},
  volume = {18},
  file = {Boril2010.pdf:Boril2010.pdf:PDF},
  owner = {kwg071000},
  pdf = {Boril2010.pdf},
  timestamp = {2009.10.12}
}
@inproceedings{Boril2009,
  title = {Unsupervised Equalization of Lombard Effect for Speech Recognition in Noisy Adverse Environment},
  author = {Hynek Bo\v{r}il and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2009},
  abstract = {When exposed to environmental noise, speakers adjust their speech production to maintain intelligible communication. This phenomenon, called Lombard effect (LE), is known to considerably impact the performance of automatic speech recognition (ASR) systems. In this study, novel frequency and cepstral domain equalizations that reduce the impact of LE on ASR are proposed. Short-time spectra of LE speech are transformed towards neutral ASR models in a maximum likelihood fashion. Dynamics of cepstral coefficients are normalized to a constant range using quantile estimations. The algorithms are incorporated in a recognizer employing a codebook of noisy acoustic models. In a recognition task on connected Czech digits presented in various levels of background car noise, the resulting system provides an absolute reduction in word error rate (WER) on 10 dB SNR data of 8.7% and 37.7% for female neutral and LE speech, and of 8.7% and 32.8% for male neutral and LE speech when compared to the baseline system employing perceptual linear prediction (PLP) coefficients and cepstral mean and variance normalization. Index Terms- Lombard effect, speech recognition, frequency warping, cepstral compensation, codebook of noisy models},
  file = {Boril2009.pdf:Boril2009.pdf:PDF},
  owner = {kwg071000},
  pdf = {Boril2009.pdf},
  timestamp = {2009.07.22}
}
@inproceedings{Boril2012,
  title = {Arabic Dialect Identification - 'Is the Secret in the Silence?' and Other Observations},
  author = {Hynek Bo\v{r}il and Abhijeet Sangwan and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2012},
  abstract = {Conversational telephone speech (CTS) collections of Arabic dialects distributed through the Linguistic Data Consortium (LDC) provide an invaluable resource for the development of robust speech systems including speaker and speech recognition, translation, spoken dialogue modeling, and information summarization. They are also frequently relied on in language identification (LID) and dialect identification (DID) evaluations. The first part of this study attempts to identify the source of the relatively high DID performance on LDC's Arabic CTS corpora seen in recent literature. It is found that recordings of each dialect exhibit unique channel and noise characteristics and that silence regions are sufficient for performing reasonably accurate DID. The second part focuses on phonotactic dialect modeling that utilizes phone recognizers and support vector machines (PRSVM). A simple N-gram normalization of PRSVM input supervectors utilizing hard limiting is introduced and shown to outperform the standard approach used in current LID and DID systems. Index Terms: Arabic dialect identification, channel characteristics, LDC corpora, PRSVM},
  file = {Boril2012.pdf:Boril2012.pdf:PDF},
  owner = {kwg071000},
  pdf = {Boril2012.pdf},
  timestamp = {2012.09.13}
}
@article{Bou-Ghazale2000,
  title = {A Comparative Study of Traditional and Newly Proposed Features for Recognition of Speech Under Stress},
  author = {Sahar E. Bou-Ghazale and John H. L. Hansen},
  journal = {IEEE Trans. Speech Audio Process.},
  year = {2000},
  month = {July},
  number = {4},
  pages = {429-442},
  volume = {8},
  file = {Bou-Ghazale2000.pdf:Bou-Ghazale2000.pdf:PDF},
  owner = {kwg071000},
  pdf = {Bou-Ghazale2000.pdf},
  timestamp = {2008.03.14}
}
@inproceedings{Bou-Ghazale1995,
  title = {A Source Generator Based Modeling Framework for Synthesis of Speech Under Stress},
  author = {Sahar E. Bou-Ghazale and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {1995},
  abstract = {The objective of this paper is to formulate an algorithm to generate stressed synthetic speech from neutral speech using a source generator framework previously employed for stressed speech recognition. The following goals are addressed: (i) identify the most visible indicators of stress as perceived by the listener in stressed speaking styles such as loud, Lombard effect and angry, (ii) develop a mathematical model for representing speech production under stressed conditions, and (iii) employ the above model to produce emotional/stressed synthetic speech from neutral speech. The stress modeling scheme is applied to an existing low-bit rate CELP speech coder in order to investigate (i) the coder's ability and limitations in reproducing stressed synthetic speech, and (ii) our ability to perturb coded neutral speech parameters at the synthesis stage so that the resulting speech is perceived as being under stress. Two stress perturbation algorithms are proposed and evaluated. Results from formal listener evaluations show that 87% of neutral perturbed speech was indeed perceived as stressed.},
  file = {Bou-Ghazale1995.pdf:Bou-Ghazale1995.pdf:PDF},
  owner = {kwg071000},
  pdf = {Bou-Ghazale1995.pdf},
  timestamp = {2008.10.14}
}
@inproceedings{BouGhazale1994,
  title = {Duration and Spectral Based Token Generation for {HMM} Speech Recognition Under Stress},
  author = {Sahar E. Bou-Ghazale and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {1994},
  pages = {413-416},
  volume = {i},
  file = {BouGhazale1994.pdf:BouGhazale1994.pdf:PDF},
  owner = {kwg071000},
  pdf = {BouGhazale1994.pdf},
  timestamp = {2007.12.03}
}
@article{Cairns1994,
  title = {Nonlinear analysis and classification of speech under stressed conditions},
  author = {Douglas A. Cairns and John H. L. Hansen},
  journal = {J. Acoust. Soc. Am.},
  year = {1994},
  month = {December},
  number = {6},
  pages = {3392-3400},
  volume = {96},
  abstract = {The speech production system is capable of conveying an abundance of information with regard to sentence text, speaker identity, prosodics, as well as emotion and speaker stress. In an effort to better understand the mechanism of human voice communication, researchers have attempted to determine reliable acoustic indicators of stress using such speech production features as fundamental frequency (F0), intensity, spectral tilt, the distribution of spectral energy, and others. Their findings indicate that more work is necessary to propose a general solution. In this study, we hypothesize that speech consists of a linear and nonlinear component, and that the nonlinear component changes markedly between normal and stressed speech. To quantify the changes between normal and stressed speech, a classification procedure was developed based on the nonlinear Teager Energy operator. The Teager Energy operator provides an indirect means of evaluating the nonlinear component of speech. The system was tested using VC and CVC utterances from native speakers of English across the following speaking styles: neutral, loud, angry, Lombard effect, and clear. Results of the system evaluation show that loud and angry speech can be differentiated from neutral speech, while clear speech is more difficult to differentiate. Results also show that reliable classification of Lombard effect speech is possible, but system performance varies across speakers.},
  file = {Cairns1994.pdf:Cairns1994.pdf:PDF},
  owner = {kwg071000},
  pdf = {Cairns1994.pdf},
  timestamp = {2007.11.19}
}
@article{Das2012,
  title = {Constrained Iterative Speech Enhancement Using Phonetic Classes},
  author = {Amit Das and John H. L. Hansen},
  journal = {IEEE Trans. Audio Speech Lang. Process.},
  year = {2012},
  pages = {1869-1883},
  volume = {20},
  abstract = {The degree of influence of noise over phonemes is not uniform since it is dependent on their distinct acoustic properties. In this study, the problem of selectively enhancing speech based on broad phoneme classes is addressed using Auto-LSP, a constrained iterative speech enhancement algorithm. Multiple enhanced utterances are generated for every noisy utterance by varying the Auto-LSP parameters. The noisy utterance is then partitioned into segments based on broad level phoneme classes, and constraints are applied on each segment using a hard decision solution. To alleviate the effect of hard decision errors, a Gaussian mixture model (GMM)-based maximum-likelihood (ML) soft decision solution is also presented. The resulting utterances are evaluated over the TIMIT speech corpus using the Itakura-Saito, segmental signal-to-noise ratio (SNR) and perceptual evaluation of speech quality (PESQ) metrics over four noise types at three SNR levels. Comparative assessment over baseline enhancement algorithms like Auto-LSP, log-minimum mean squared error (log-MMSE), and log-MMSE with speech presence uncertainty (log-MMSE-SPU) demonstrates that the proposed solution exhibits greater consistency in improving speech quality over most phoneme classes and noise types considered in this study. Index Terms - Auditory masked threshold, Auto-LSP, constrained iterative speech enhancement.},
  file = {Das2012.pdf:Das2012.pdf:PDF},
  owner = {kwg071000},
  pdf = {Das2012.pdf},
  timestamp = {2012.09.06}
}
@book{Deller2000,
  title = {Discrete-Time Processing of Speech Signals},
  author = {John R. Deller and John H. L. Hansen and John G. Proakis},
  publisher = {IEEE Press},
  address = {Piscataway, NJ},
  year = {2000},
  owner = {kwg071000},
  timestamp = {2009.09.11}
}
@inproceedings{Fan2009,
  title = {Speaker Identification with Whispered Speech Based on Modified {LFCC} Parameters and Feature Mapping},
  author = {Xing Fan and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2009},
  abstract = {Much recent research in speaker recognition has been devoted to robustness due to microphone and channel effects. However, changes in vocal effort, especially whispered speech, present significant challenges in maintaining system performance. Due to the absence of any periodic excitation in whisper, the spectral structure in whisper and neutral speech will differ. Therefore, performance of speaker ID systems, trained mainly with high energy voiced phonemes, degrades when tested with whisper. This study considers a front-end feature compensation method for whispered speech to improve speaker recognition using a neutral trained system. First, an alternative feature vector with linear frequency cepstral coefficients (LFCC) is introduced based on spectral analysis from both speech modes. Next, for the first time a feature mapping is proposed for reducing whisper/neutral mismatch in speaker ID. Feature mapping is applied on a frame-by-frame basis between two speaker independent GMMs (Gaussian Mixture Models) of whispered and neutral speech. Text independent closed set speaker ID results show an absolute 20% improvement in accuracy when compared with a traditional MFCC feature based system. This result confirms a viable approach to improving speaker ID performance between neutral and whispered speech conditions. Index Terms: whisper, speaker identification, linear scale cepstrum coefficients, feature mapping},
  file = {Fan2009.pdf:Fan2009.pdf:PDF},
  owner = {kwg071000},
  pdf = {Fan2009.pdf},
  timestamp = {2009.07.22}
}
@inproceedings{CP-ICASSP15-WhisperASR-SG_HB_JH-0005024,
  title = {Generative Modeling of Pseudo-Target Domain Adaptation Samples for Whispered Speech Recognition},
  author = {S. Ghaffarzadegan and Hynek Bo\v{r}il and John H.L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2015},
  file = {:CP-ICASSP15-WhisperASR-SG_HB_JH-0005024.pdf:PDF},
  owner = {axm101521},
  pdf = {CP-ICASSP15-WhisperASR-SG_HB_JH-0005024.pdf},
  timestamp = {2015.05.03}
}
@inproceedings{godin2013sid,
  title = {Impact of noise reduction and spectrum estimation on noise robust speaker identification},
  author = {Keith Godin and Seyed Omid Sadjadi and John H.L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2013},
  file = {:godin2013sid.pdf:PDF},
  owner = {axm101521},
  pdf = {godin2013sid.pdf},
  timestamp = {2013.09.12}
}
@inproceedings{Godin2011,
  title = {Vowel context and speaker interactions influencing glottal open quotient and formant frequency shifts in physical task stress},
  author = {Keith W. Godin and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2011},
  pages = {2945-2948},
  file = {Godin2011.pdf:Godin2011.pdf:PDF},
  owner = {kwg071000},
  pdf = {Godin2011.pdf},
  timestamp = {2011.12.10}
}
@article{Godin2011a,
  title = {Analysis of the effects of physical task stress on the speech signal},
  author = {Keith W. Godin and John H. L. Hansen},
  journal = {J. Acoust. Soc. Am.},
  year = {2011},
  pages = {3992-3998},
  volume = {130},
  abstract = {Physical task stress is known to affect the fundamental frequency and other measurements of the speech signal. A corpus of physical task stress speech is analyzed using a spectrum F-ratio and frame score distribution divergences. The measurements differ between phone classes, and are greater for vowels and nasals than for plosives and fricatives. In further analysis, frame score distribution divergences are used to measure the spectral dissimilarity between neutral and physical task stress speech. Frame scores are the log likelihood ratios between Gaussian mixture models (GMMs) of physical task stress and of neutral speech. Mel-frequency cepstral coefficients are used as the acoustic feature inputs to the GMMs. A Laplacian distribution is fitted to the frame scores for each of ten phone classes, and the symmetric Kullback-Leibler divergence is employed to measure the change in distribution from neutral to physical task stress. The results suggest that the spectral dissimilarity is greatest for the second level of a four level exertion measurement, and that spectral dissimilarity is greater for nasal phones than for plosives and fricatives. Further, the results suggest that different phone classes are affected differently by physical task stress.},
  file = {Godin2011a.pdf:Godin2011a.pdf:PDF},
  owner = {kwg071000},
  pdf = {Godin2011a.pdf},
  timestamp = {2012.03.02}
}
@inproceedings{Godin2010,
  title = {Session variability contrasts in the {MARP} corpus},
  author = {Keith W. Godin and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2010},
  pages = {298-301},
  file = {Godin2010.pdf:Godin2010.pdf:PDF},
  owner = {kwg071000},
  pdf = {Godin2010.pdf},
  timestamp = {2011.10.25}
}
@inproceedings{Godin2008,
  title = {Analysis and Perception of Speech Under Physical Task Stress},
  author = {Keith W. Godin and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2008},
  address = {Brisbane, Australia},
  month = {Sep.},
  pages = {1674-1677},
  abstract = {It is known that speech under physical task stress degrades speech system performance. Therefore, an analysis of speech under physical task stress is performed across several parameters to identify acoustic correlates. Formal listener tests are also performed to determine the relationship between acoustic correlates and perception. To verify the statistical significance of all results, Student's t-tests are applied. It was found that fundamental frequency decreases for many speakers, that utterance duration increases for some speakers and decreases for others, and that the glottal waveform is quantifiably different for many speakers. Perturbation of two speech features, fundamental frequency and the glottal waveform, is applied in listener tests to quantify the degree to which these features convey physical stress content in speech. Finally, the enhanced understanding of physical task stress speech provided here is discussed in the context of speech systems. Index Terms: physical task stress, stress analysis},
  file = {Godin2008.pdf:Godin2008.pdf:PDF},
  owner = {kwg071000},
  pdf = {Godin2008.pdf},
  timestamp = {2008.10.09}
}
@inproceedings{Godin2012,
  title = {Glottal Waveform Analysis of Physical Task Stress Speech},
  author = {Keith W. Godin and Taufiq Hasan and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2012},
  owner = {kwg071000},
  timestamp = {2012.04.20}
}
@inproceedings{scenic2013sad,
  title = {All for One: Feature Combination for Highly Channel-Degraded Speech Activity Detection},
  author = {Martin Graciarena and Abeer Alwan and Dan Ellis and Horacio Franco and Luciana Ferrer and John H.L. Hansen and Adam Janin and Byung-Suk Lee and Yun Lei and Vikramjit Mitra and Nelson Morgan and Seyed Omid Sadjadi and T.J. Tsai and Nicolas Scheffer and Lee Ngee Tan and Benjamin Williams},
  booktitle = {Proc. Interspeech},
  year = {2013},
  file = {:scenic2013sad.pdf:PDF},
  owner = {axm101521},
  pdf = {scenic2013sad.pdf},
  timestamp = {2013.10.15}
}
@article{Hansen1996a,
  title = {Analysis and compensation of speech under stress and noise for environmental robustness in speech recognition},
  author = {John H. L. Hansen},
  journal = {Speech Commun.},
  year = {1996},
  month = {Nov.},
  pages = {151-173},
  volume = {20},
  abstract = {It is well known that the introduction of acoustic background distortion and the variability resulting from environmentally induced stress causes speech recognition algorithms to fail. In this paper, several causes for recognition performance degradation are explored. It is suggested that recent studies based on a Source Generator Framework can provide a viable foundation in which to establish robust speech recognition techniques. This research encompasses three inter-related issues: (i) analysis and modeling of speech characteristics brought on by workload task stress, speaker emotion/stress or speech produced in noise (Lombard effect), (ii) adaptive signal processing methods tailored to speech enhancement and stress equalization, and (iii) formulation of new recognition algorithms which are robust in adverse environments. An overview of a statistical analysis of a Speech Under Simulated and Actual Stress (SUSAS) database is presented. This study was conducted on over 200 parameters in the domains of pitch, duration, intensity, glottal source and vocal tract spectral variations. These studies motivate the development of a speech modeling approach entitled Source Generator Framework in which to represent the dynamics of speech under stress. This framework provides an attractive means for performing feature equalization of speech under stress. In the second half of this paper, three novel approaches for signal enhancement and stress equalization are considered to address the issue of recognition under noisy stressful conditions. The first method employs (Auto:I,LSP:T) constrained iterative speech enhancement to address background noise and maximum likelihood stress equalization across formant location and bandwidth. The second method uses a feature enhancing artificial neural network which transforms the input stressed speech feature set during parameterization for keyword recognition. The final method employs morphological constrained feature enhancement to address noise and an adaptive Mel-cepstral compensation algorithm to equalize the impact of stress. Recognition performance is demonstrated for speech under a range of stress conditions, signal-to-noise ratios and background noise types.},
  file = {Hansen1996a.pdf:Hansen1996a.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hansen1996a.pdf},
  timestamp = {2008.04.10}
}
@article{Hansen1994,
  title = {Morphological Constrained Feature Enhancement with Adaptive Cepstral Compensation (MCE-ACC) for Speech Recognition in Noise and Lombard Effect},
  author = {John H. L. Hansen},
  journal = {IEEE Trans. Speech Audio Process.},
  year = {1994},
  number = {4},
  pages = {598-614},
  volume = {2},
  abstract = {The use of present-day speech recognition techniques in many practical applications has demonstrated the need for improved algorithm formulation under varying acoustical environments. This paper describes a low-vocabulary speech recognition algorithm that provides robust performance in noisy environments with particular emphasis on characteristics due to the Lombard effect. A neutral and stressed-based source generator framework is established to achieve improved speech parameter characterization using a morphological constrained enhancement algorithm and stressed source compensation, which is unique for each source generator across a stressed speaking class. The algorithm uses a noise-adaptive boundary detector to obtain a sequence of source generator classes, which is used to direct noise parameter enhancement and stress compensation. This allows the parameter enhancement and stress compensation schemes to adapt to changing speech generator types. A phonetic consistency rule is also employed based on input source generator partitioning. Algorithm performance evaluation is demonstrated for noise-free and nine noisy Lombard speech conditions that include additive white Gaussian noise, slowly varying computer fan noise, and aircraft cockpit noise. System performance is compared with a traditional discrete-observation recognizer with no embellishments. Recognition rates are shown to increase from an average 36.7% for a baseline recognizer to 74.7% for the new algorithm (a 38% improvement). The new algorithm is also shown to be more consistent, as demonstrated by a decrease in standard deviation of recognition from 21.1 to 11.9 and a reduction in confusable word-pairs under noisy, Lombard-effect stressed speaking conditions},
  file = {Hansen1994.pdf:Hansen1994.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hansen1994.pdf},
  timestamp = {2008.10.09}
}
@inproceedings{Hansen1989,
  title = {Evaluation of Acoustic Correlates of Speech Under Stress for Robust Speech Recognition},
  author = {John H. L. Hansen},
  booktitle = {Proc. Fifteenth Annual Northeast Bioengineering Conf.},
  year = {1989},
  address = {Boston},
  month = {March},
  pages = {31-32},
  abstract = {Results are presented of an investigation of how speech characteristics change under varying levels of stress with specific application to improving automatic isolated-word speech recognition. The evaluation focused on five speech analysis domains: pitch, glottal source, intensity, duration, and vocal tract shaping. Goodness-of-fit statistical tests were used to ascertain the significance of parameter variation in each domain. Results from analysis of pitch and glottal source spectrum are presented. The findings suggest that such parameter information can be used reliably to aid in automatic isolated-word speech recognition in noisy stressful environments.},
  file = {Hansen1989.pdf:Hansen1989.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hansen1989.pdf},
  timestamp = {2008.03.14}
}
@phdthesis{Hansen1988,
  title = {Analysis and compensation of stressed and noisy speech with application to robust automatic recognition},
  author = {John H. L. Hansen},
  school = {Georgia Inst. Tech.},
  year = {1988},
  address = {Atlanta, GA},
  month = {July},
  file = {Hansen1988.pdf:Hansen1988.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hansen1988.pdf},
  timestamp = {2008.03.24}
}
@article{Hansen1995a,
  title = {Robust feature-estimation and objective quality assessment for noisy speech recognition using the Credit Card corpus},
  author = {John H. L. Hansen and Levent M. Arslan},
  journal = {IEEE Trans. Speech Audio Process.},
  year = {1995},
  number = {3},
  pages = {169-184},
  volume = {3},
  abstract = {The introduction of acoustic background distortion into speech causes recognition algorithms to fail. In order to improve the environmental robustness of speech recognition in adverse conditions, a novel constrained-iterative feature-estimation algorithm is considered and shown to produce improved feature characterization in a variety of actual noise conditions. In addition, an objective measure based MAP estimator is formulated as a means of predicting changes in robust recognition performance at the speech feature extraction stage. The four measures considered include (i) NIST SNR; (ii) Itakura-Saito log-likelihood; (iii) log-area-ratio; (iv) the weighted-spectral slope measure. A continuous distribution, monophone based, hidden Markov model recognition algorithm is used for objective measure based MAP estimator analysis and recognition evaluation. Evaluations were based on speech data from the Credit Card corpus (CC-DATA). It is shown that feature enhancement provides a consistent level of recognition improvement for broadband and low-frequency colored noise sources. As the stationarity assumption for a given noise source breaks down, the ability of feature enhancement to improve recognition performance decreases. Finally, the log-likelihood based MAP estimator was found to be the best predictor of recognition performance, while the NIST SNR based MAP estimator was found to be the poorest recognition predictor across the 27 noise conditions considered.},
  file = {Hansen1995a.pdf:Hansen1995a.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hansen1995a.pdf},
  timestamp = {2011.10.03}
}
@article{Hansen1991,
  title = {Constrained iterative speech enhancement with application to speech recognition},
  author = {John H. L. Hansen and Mark A. Clements},
  journal = {IEEE Trans. Signal Process.},
  year = {1991},
  number = {4},
  pages = {795-805},
  volume = {39},
  abstract = {In this paper, an improved form of iterative speech enhancement for single channel inputs is formulated. The basis of the procedure is sequential maximum a posteriori estimation of the speech waveform and its all-pole parameters as originally formulated by Lim and Oppenheim, followed by imposition of constraints upon the sequence of speech spectra. The new approaches impose intraframe and interframe constraints on the input speech signal to ensure more speechlike formant trajectories, reduce frame-to-frame pole jitter, and effectively introduce a relaxation parameter to the iterative scheme. Recently discovered properties of the line spectral pair representation of speech allow for an efficient and direct procedure for application of many of the constraint requirements. Substantial improvement over the unconstrained method has been observed in a variety of domains. First, informal listener quality evaluation tests and objective speech quality measures demonstrate the technique's effectiveness for additive white Gaussian noise. A consistent terminating point for the iterative technique is also shown. Second, the algorithms have been generalized and successfully tested for noise which is nonwhite and slowly varying in characteristics. The current systems result in substantially improved speech quality and LPC parameter estimation in this context with only a minor increase in computational requirements. Third, the algorithms were evaluated with respect to improving automatic recognition of speech in the presence of additive noise, and shown to outperform other enhancement methods in this application.},
  file = {Hansen1991.pdf:Hansen1991.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hansen1991.pdf},
  timestamp = {2011.04.15}
}
@inproceedings{Hansen1989a,
  title = {Stress Compensation and Noise Reduction Algorithms for Robust Speech Recognition},
  author = {John H. L. Hansen and Mark A. Clements},
  booktitle = {Proc. IEEE ICASSP},
  year = {1989},
  pages = {266-269},
  abstract = {The problem of speech recognition in noisy, stressful environments is addressed. The main contribution is the achievement of robust recognition in diverse environmental conditions through the formulation of a series of speech-enhancement and stress-compensation preprocessing algorithms. These preprocessors produce speech or recognition features less sensitive to varying factors caused by stress and noise. Results from four recognition scenarios based on such preprocessing are reported. Neutral, stressful, noisy neutral, and noisy stressful speech styles are considered. Noise reduction is based on constrained iterative speech enhancement. Stress compensation algorithms are based on formant location, bandwidth, and intensity. Enhancement preprocessing increases recognition by +34% for neutral speech, 18% for stressed speech. Combined stress compensation and speech enhancement preprocessing increases recognition rates by an average of +27% (e.g., +43% loudly spoken speech, +42% speech spoken under Lombard effect). As a result, combined speech enhancement and stress compensation preprocessing has been shown to be extremely effective in reducing the effects caused by stress and noise for robust speech recognition.},
  file = {Hansen1989a.pdf:Hansen1989a.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hansen1989a.pdf},
  timestamp = {2009.07.15}
}
@inproceedings{Hansen1988a,
  title = {Constrained iterative speech enhancement with application to automatic speech recognition},
  author = {John H. L. Hansen and Mark A. Clements},
  booktitle = {Proc. IEEE ICASSP},
  year = {1988},
  pages = {561-564},
  abstract = {A set of iterative speech enhancement techniques using spectral constraints is extended and evaluated. The approaches apply inter- and intraframe spectral constraints to ensure optimum speech quality across all classes of speech. Constraints are applied on the basis of the presence of perceptually important speech characteristics found during the enhancement procedure. Results show improvement over past techniques for additive white noise distortions. Three points are addressed in the present study. First, a convenient and consistent terminating point for the iterative technique is presented which was previously unavailable. Second, the techniques have been generalized to allow for slowly varying, colored noise. Finally, a comparative evaluation has been performed to determine their usefulness as preprocessors for recognition in extremely noisy environments in the vicinity of 0 dB SNR.},
  file = {Hansen1988a.pdf:Hansen1988a.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hansen1988a.pdf},
  timestamp = {2012.10.23}
}
@inproceedings{Hansen1987,
  title = {Iterative speech enhancement with spectral constraints},
  author = {John H. L. Hansen and Mark A. Clements},
  booktitle = {Proc. IEEE ICASSP},
  year = {1987},
  pages = {189-192},
  abstract = {A new and improved iterative speech enhancement technique based on spectral constraints is presented in this paper. The iterative technique, originally formulated by Lim and Oppenheim, attempts to solve for the maximum likelihood estimate of a speech waveform in additive white noise. The new approach applies inter- and intra-frame spectral constraints to ensure convergence to reasonable values and hence improve speech quality. An extremely efficient technique for applying these constraints is the use of line spectral pair (LSP) coefficients. The inter-frame constraints ensure more speech-like formant trajectories than those found in the unconstrained approach. Results from speech degraded by additive white Gaussian noise show noticeable quality improvement.},
  file = {Hansen1987.pdf:Hansen1987.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hansen1987.pdf},
  timestamp = {2012.10.22}
}
@article{Hansen1995,
  title = {Robust estimation of speech in noisy backgrounds based on aspects of the auditory process},
  author = {John H. L. Hansen and S. Nandkumar},
  journal = {J. Acoust. Soc. Am.},
  year = {1995},
  number = {6},
  pages = {3833-3849},
  volume = {97},
  abstract = {A new approach to speech enhancement is proposed where constraints based on aspects of the auditory process augment an iterative enhancement framework. The basic enhancement framework is based on a previously developed dual-channel scenario using a two-step iterative Wiener filtering algorithm. Constraints across broad speech sections and over iterations are then experimentally developed on a novel auditory representation derived by transforming the speech magnitude spectrum. The spectral transformations are based on modeling aspects of the human auditory process which include critical band filtering, intensity-to-loudness conversion, and lateral inhibition. The auditory transformations and perceptual based constraints are shown to result in a new set of auditory constrained and enhanced linear prediction (ACE-LP) parameters. The ACE-LP based speech spectrum is then incorporated into the iterative Wiener filtering framework. The improvements due to auditory constraints are demonstrated in several areas. The proposed auditory representation is shown to result in improved spectral characterization in background noise. The auditory constrained iterative enhancement (ACE-II) algorithm is shown to result in improved quality over all sections of enhanced speech. Adaptation of auditory based constraints to changing spectral characteristics over broad classes of speech is another novel aspect of the proposed algorithm. The consistency of speech quality improvement for the ACE-II algorithm is illustrated over time and across all phonemes classified over a large set of phonetically balanced sentences from the TIMIT database. This study demonstrates the application of auditory based perceptual properties of a human listener to speech enhancement in noise, resulting in improved and consistent speech quality over all regions of speech.},
  file = {Hansen1995.pdf:Hansen1995.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hansen1995.pdf},
  timestamp = {2011.10.03}
}
@article{Hansen2009,
  title = {Analysis and Compensation of Lombard Speech Across Noise Type and Levels with Application to In-Set/Out-of-Set Speaker Recognition},
  author = {John H. L. Hansen and Vaishnevi Varadarajan},
  journal = {IEEE Trans. Audio Speech Lang. Process.},
  year = {2009},
  pages = {366-378},
  volume = {17},
  abstract = {Speech production in the presence of noise results in the Lombard Effect, which is known to have a serious impact on speech system performance. In this study, Lombard speech produced under different types and levels of noise is analyzed in terms of duration, energy histogram, and spectral tilt. Acoustic-phonetic differences are shown to exist between different "flavors" of Lombard speech based on analysis of trends from a Gaussian mixture model (GMM)-based Lombard speech type classifier. For the first time, the dependence of Lombard speech on noise type and noise level is established for the purposes of speech processing systems. Also, the impact of the different flavors of Lombard Effect on speech system performance is shown with respect to an in-set/out-of-set speaker recognition task. System performance is shown to degrade from an equal error rate (EER) of 7.0% under matched neutral training and testing conditions, to an average EER of 26.92% when trained with neutral and tested with Lombard Effect speech. Furthermore, improvement in the performance of in-set/out-of-set speaker recognition is demonstrated by adapting neutral speaker models with Lombard speech data of limited duration. Improved average EERs of 4.75% and 12.37% were achieved for matched and mismatched adaptation and testing conditions, respectively. At the highest noise levels, an EER as low as 1.78% was obtained by adapting neutral speaker models with Lombard speech of limited duration. The study therefore illustrates the impact of Lombard Effect on speaker recognition, and effective methods to improve system performance for speaker recognition when train/test conditions are mismatched for neutral versus Lombard Effect speech.},
  file = {Hansen2009.pdf:Hansen2009.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hansen2009.pdf},
  timestamp = {2009.06.29}
}
@article{Hansen1996,
  title = {Feature Analysis and Neural Network-Based Classification of Speech Under Stress},
  author = {John H. L. Hansen and Brian D. Womack},
  journal = {IEEE Trans. Speech Audio Process.},
  year = {1996},
  month = {July},
  number = {4},
  pages = {307-313},
  volume = {4},
  abstract = {It is well known that the variability in speech production due to task-induced stress contributes significantly to loss in speech processing algorithm performance. If an algorithm could be formulated that detects the presence of stress in speech, then such knowledge could be used to monitor speaker state, improve the naturalness of speech coding algorithms, or increase the robustness of speech recognizers. The goal in this study is to consider several speech features as potential stress-sensitive relayers using a previously established stressed speech database (SUSAS). The following speech parameters are considered: mel, delta-mel, delta-delta-mel, auto-correlation-mel, and cross-correlation-mel cepstral parameters. Next, an algorithm for speaker-dependent stress classification is formulated for the 11 stress conditions: angry, clear, cond50, cond70, fast, Lombard, loud, normal, question, slow, and soft. It is suggested that additional feature variations beyond neutral conditions reflect the perturbation of vocal tract articulator movement under stressed conditions. Given a robust set of features, a neural network-based classifier is formulated based on an extended delta-bar-delta learning rule. The performance is considered for the following three test scenarios: monopartition (nontargeted) and tripartition (both nontargeted and targeted) input feature vectors},
  file = {Hansen1996.pdf:Hansen1996.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hansen1996.pdf},
  timestamp = {2007.10.18}
}
@inproceedings{Hasan2012,
  title = {Factor Analysis of Acoustic Features Using a Mixture of Probabilistic Principal Component Analyzers for Robust Speaker Verification},
  author = {Taufiq Hasan and John H. L. Hansen},
  booktitle = {Proc. Odyssey},
  year = {2012},
  abstract = {Robustness due to mismatched train/test conditions is one of the biggest challenges facing speaker recognition today, with transmission channel/handset and additive noise distortion being the most prominent factors. One limitation of recent speaker recognition systems is that they are based on latent factor analysis modeling of the GMM mean super-vectors alone. Motivated by the covariance structure of cepstral features, in this study, we develop a factor analysis model in the acoustic feature space instead of the super-vector domain. The proposed technique computes a mixture dependent feature dimensionality reduction transform and is directly applied to the first order Baum-Welch statistics for effective integration with a conventional i-vector-PLDA system. Experimental results on the telephone trials of the NIST SRE 2010 demonstrate the superiority of the proposed scheme.},
  file = {Hasan2012.pdf:Hasan2012.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hasan2012.pdf},
  timestamp = {2012.08.29}
}
@inproceedings{Hasan2012a,
  title = {Front-end Channel Compensation using Mixture-dependent Feature Transformations for i-Vector Speaker Recognition},
  author = {Taufiq Hasan and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2012},
  abstract = {State-of-the-art session variability compensation methods for speaker recognition are generally based on various linear statistical models of the Gaussian Mixture Model (GMM) mean super-vectors, while front-end features are only processed by standard normalization techniques. In this study, we propose a front-end channel compensation framework using mixture-localized linear transforms that operate before super-vector domain modeling begins. In this approach, local linear transforms are trained for each Gaussian component of a Universal Background Model (UBM), and are applied to acoustic features according to their mixture-wise probabilistic alignment, yielding an operation that is globally non-linear. We examine Principal Component Analysis (PCA), whitening, Linear Discriminant Analysis (LDA) and Nuisance Attribute Projection (NAP) as front-end feature transformations. We also propose a method, Nuisance Attribute Elimination (NAE), which is similar to NAP but performs dimensionality reduction in addition to channel compensation. We show that the proposed framework can be readily integrated with a standard i-Vector system by simply applying the transformations on the first order Baum-Welch statistics and transforming the UBM. Experiments performed on the telephone trials of the NIST SRE 2010 demonstrate significant performance gain from the proposed framework, especially using LDA as the front-end transformation.},
  file = {Hasan2012a.pdf:Hasan2012a.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hasan2012a.pdf},
  timestamp = {2012.09.13}
}
@inproceedings{Hasan2012b,
  title = {Integrated Feature Normalization and Enhancement for Robust Speaker Recognition Using Acoustic Factor Analysis},
  author = {Taufiq Hasan and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2012},
  abstract = {State-of-the-art factor analysis based channel compensation methods for speaker recognition are based on the assumption that speaker/utterance dependent Gaussian Mixture Model (GMM) mean super-vectors can be constrained to lie in a lower dimensional subspace, which does not consider the fact that conventional acoustic features may also be constrained in a similar way in the feature space. In this study, motivated by the low-rank covariance structure of cepstral features, we propose a factor analysis model in the acoustic feature space instead of the super-vector domain and derive a mixture dependent feature transformation. We demonstrate that the proposed Acoustic Factor Analysis (AFA) transformation performs feature dimensionality reduction, de-correlation, variance normalization and enhancement at the same time. The transform applies a square-root Wiener gain on the acoustic feature eigenvector directions, and is similar to the signal sub-space based speech enhancement schemes. We also propose several methods of adaptively selecting the AFA parameter for each mixture. The proposed feature transform is applied using a probabilistic mixture alignment, and is integrated with a conventional i-Vector system. Experimental results on the telephone trials of the NIST SRE 2010 demonstrate the effectiveness of the proposed scheme.},
  file = {Hasan2012b.pdf:Hasan2012b.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hasan2012b.pdf},
  timestamp = {2012.09.24}
}
@inproceedings{Hasan2010,
  title = {A Novel Feature Sub-Sampling Method for Efficient Universal Background Model Training in Speaker Verification},
  author = {Taufiq Hasan and Yun Lei and Aravind Chandrasekaran and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2010},
  pages = {4494-4497},
  abstract = {Speaker recognition/verification systems require an extensive universal background model (UBM), which typically requires extensive resources, especially if new channel domains are considered. In this study we propose an effective and computationally efficient algorithm for training the UBM for speaker verification. A novel method based on Euclidean distance between features is developed for effective sub-sampling of potential training feature vectors. Using only about 1.5 seconds of data from each development utterance, the proposed UBM training method drastically reduces the computation time, while improving, or at least retaining original speaker verification system performance. While methods such as factor analysis can mitigate some of the issues associated with channel/microphone/environmental mismatch, the proposed rapid UBM training scheme offers a viable alternative for rapid environment dependent UBMs. Index Terms- Speaker verification, universal background model.},
  file = {Hasan2010.pdf:Hasan2010.pdf:PDF},
  owner = {kwg071000},
  pdf = {Hasan2010.pdf},
  timestamp = {2010.03.22}
}
@inproceedings{Hasan2013,
  title = {{CRSS} Systems for the 2012 {NIST} Speaker Recognition Evaluation},
  author = {Taufiq Hasan and Seyed Omid Sadjadi and Gang Liu and Navid Shokouhi and Hynek Bo\v{r}il and John H.L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2013},
  abstract = {This paper describes the systems developed by the Center for Robust Speech Systems (CRSS) for the 2012 National Institute of Standards and Technology (NIST) Speaker Recognition Evaluation (SRE). Given that the emphasis of SRE'12 is on noisy and short duration test conditions, our system development focused on: (i) novel robust acoustic features, (ii) new feature normalization schemes, (iii) various back-end strategies utilizing multi-session and multi-condition training, and (iv) quality measure based system fusion. Noisy and short duration training/test conditions are artificially generated and effectively utilized. Active speech duration and signal-to-noise-ratio (SNR) estimates are successfully employed as quality measures for system calibration and fusion. Overall system performance was very successful for the given test conditions.},
  owner = {axm101521},
  pdf = {crss_system_paper_icassp.pdf},
  timestamp = {2013.07.11}
}
@inproceedings{CP-ICASSP15-CI-ASR-TTS-OH_SG_JH-0005093,
  title = {Leveraging Speech Recognition in Cochlear Implants for Improved Speech Intelligibility under Reverberation},
  author = {O. Hazrati and S. Ghaffarzadegan and John H.L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2015},
  file = {:CP-ICASSP15-CI-ASR-TTS-OH_SG_JH-0005093.pdf:PDF},
  owner = {axm101521},
  pdf = {CP-ICASSP15-CI-ASR-TTS-OH_SG_JH-0005093.pdf},
  timestamp = {2015.05.03}
}
@article{Huang2006,
  title = {Advances in unsupervised audio classification and segmentation for the broadcast news and NGSW corpora},
  author = {Rongqing Huang and John H. L. Hansen},
  journal = {IEEE Trans. Audio Speech Lang. Process.},
  year = {2006},
  month = {May},
  number = {3},
  pages = {907-919},
  volume = {14},
  abstract = {The problem of unsupervised audio classification and segmentation continues to be a challenging research problem which significantly impacts automatic speech recognition (ASR) and spoken document retrieval (SDR) performance. This paper addresses novel advances in 1) audio classification for speech recognition and 2) audio segmentation for unsupervised multispeaker change detection. A new algorithm is proposed for audio classification, which is based on weighted GMM Networks (WGN). Two new extended-time features: variance of the spectrum flux (VSF) and variance of the zero-crossing rate (VZCR) are used to preclassify the audio and supply weights to the output probabilities of the GMM networks. The classification is then implemented using weighted GMM networks. Since historically there have been no features specifically designed for audio segmentation, we evaluate 16 potential features including three new proposed features: perceptual minimum variance distortionless response (PMVDR), smoothed zero-crossing rate (SZCR), and filterbank log energy coefficients (FBLC) in 14 noisy environments to determine the best robust features on the average across these conditions. Next, a new distance metric, 2-mean, is proposed which is intended to improve segmentation for short segment turns (i.e., 1-5 s). A new false alarm compensation procedure is implemented, which can compensate the false alarm rate significantly with little cost to the miss rate. Evaluations on a standard data set Defense Advanced Research Projects Agency (DARPA) Hub4 Broadcast News 1997 evaluation data show that the WGN classification algorithm achieves over a 50% improvement versus the GMM network baseline algorithm, and the proposed compound segmentation algorithm achieves 23%-10% improvement in all metrics versus the baseline Mel-frequency cepstral coefficients (MFCC) and traditional Bayesian information criterion (BIC) algorithm. The new classification and segmentation algorithms also obtain very satisfactory results on the more diverse and challenging National Gallery of the Spoken Word (NGSW) corpus.},
  file = {Huang2006.pdf:Huang2006.pdf:PDF},
  owner = {kwg071000},
  pdf = {Huang2006.pdf},
  review = {PMVDR FBLC},
  timestamp = {2007.09.26}
}
@inproceedings{Huang2004,
  title = {Advances in unsupervised audio segmentation for the broadcast news and NGSW corpora},
  author = {Rongqing Huang and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2004},
  month = {17-21 May},
  pages = {I-741--I-744},
  volume = {1},
  doi = {10.1109/ICASSP.2004.1326092},
  file = {Huang2004.pdf:Huang2004.pdf:PDF},
  owner = {kwg071000},
  pdf = {Huang2004.pdf},
  review = {PMVDR FES Hotelling T^2},
  timestamp = {2007.09.26}
}
@inproceedings{Ikeno2007,
  title = {{UT-Scope}: Speech under Lombard effect and cognitive stress},
  author = {Ayako Ikeno and Vaishnevi Varadarajan and Sanjay Patil and John H. L. Hansen},
  booktitle = {Proc. IEEE Aerospace Conf.},
  year = {2007},
  address = {Big Sky, Montana},
  pages = {1-7},
  abstract = {This paper presents the UT-Scope database and an automatic and perceptual evaluation of Lombard speech in in-set speaker recognition. The speech used for the analysis forms a part of the UT-Scope database and consists of sentences from the well-known TIMIT corpus, spoken in the presence of highway, large crowd and pink noise. First, the deterioration of the EER of an in-set speaker identification system trained on neutral and tested with Lombard speech is illustrated. A clear demarcation between the effect of noise and the Lombard effect is also given by testing with noisy Lombard speech. The effect of test-token duration on system performance under the Lombard condition is addressed. We also report results from in-set speaker recognition tasks performed by human subjects in comparison to the system performance. Overall observations suggest that a deeper understanding of the cognitive factors involved in perceptual speaker ID offers meaningful insights for further development of automated systems.},
  file = {Ikeno2007.pdf:Ikeno2007.pdf:PDF},
  owner = {kwg071000},
  pdf = {Ikeno2007.pdf},
  timestamp = {2007.10.04}
}
@article{Jensen2001,
  title = {Speech Enhancement Using a Constrained Iterative Sinusoidal Model},
  author = {Jesper Jensen and John H. L. Hansen},
  journal = {IEEE Trans. Speech Audio Process.},
  year = {2001},
  pages = {731-740},
  volume = {9},
  abstract = {This paper presents a sinusoidal model based algorithm for enhancement of speech degraded by additive broad-band noise. In order to ensure speech-like characteristics observed in clean speech, smoothness constraints are imposed on the model parameters using a spectral envelope surface (SES) smoothing procedure. Algorithm evaluation is performed using speech signals degraded by additive white Gaussian noise. Distortion as measured by objective speech quality scores showed a 34%-41% reduction over a SNR range of 5-to-20 dB. Objective and subjective evaluations also show considerable improvement over traditional spectral subtraction and Wiener filtering based schemes. Finally, in a subjective AB preference test, where enhanced signals were coded with the G729 codec, the proposed scheme was preferred over the traditional enhancement schemes tested for SNRs in the range of 5 to 20 dB.},
  file = {Jensen2001.pdf:Jensen2001.pdf:PDF},
  owner = {kwg071000},
  pdf = {Jensen2001.pdf},
  timestamp = {2012.10.10}
}
@inproceedings{Kim2009a,
  title = {Robust Angry Speech Detection Employing a TEO-Based Discriminative Classifier Combination},
  author = {Wooil Kim and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2009},
  pages = {2019-2022},
  abstract = {This study proposes an effective angry speech detection approach employing the TEO-based feature extraction. Decorrelation processing is applied to the TEO-based feature to increase model training ability by decreasing the correlation between feature elements and vector size. Minimum classification error training is employed to increase the discrimination between the angry speech model and other stressed speech models. Combination with the conventional Mel frequency cepstral coefficients (MFCC) is also employed to leverage the effectiveness of MFCC to characterize the spectral envelope of speech signals. Experimental results over the SUSAS corpus demonstrate the proposed angry speech detection scheme is effective at increasing detection accuracy on an open-speaker and open-vocabulary task. An improvement of up to 7.78% in classification accuracy is obtained by combination of the proposed methods including decorrelation of TEO-based feature vector, discriminative training, and classifier combination.},
  file = {Kim2009a.pdf:Kim2009a.pdf:PDF},
  owner = {kwg071000},
  pdf = {Kim2009a.pdf},
  timestamp = {2012.03.02}
}
@inproceedings{CP-ICASSP15-CI-LENA-JL_HA_AZ_JH-0005132,
  title = {Analysis of Speech and Language Communication for Cochlear Implant Users in Noisy Lombard Conditions},
  author = {J. Lee and H. Ali and A. Ziaei and John H.L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2015},
  file = {:CP-ICASSP15-CI-LENA-JL_HA_AZ_JH-0005132.pdf:PDF},
  owner = {axm101521},
  pdf = {CP-ICASSP15-CI-LENA-JL_HA_AZ_JH-0005132.pdf},
  timestamp = {2015.05.03}
}
@inproceedings{Lei2010a,
  title = {Speaker Recognition using Supervised Probabilistic Principal Component Analysis},
  author = {Yun Lei and John H.L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2010},
  pages = {382-385},
  abstract = {In this study, a supervised probabilistic principal component analysis (SPPCA) model is proposed in order to integrate the speaker label information into a factor analysis approach using the well-known probabilistic principal component analysis (PPCA) model under a support vector machine (SVM) framework. The latent factor from the proposed model is believed to be more discriminative than one from the PPCA model. The proposed model, combined with different types of intersession compensation techniques in the back-end, is evaluated using the National Institute of Standards and Technology (NIST) Speaker Recognition Evaluation (SRE) 2008 data corpus, along with a comparison to the PPCA model.},
  file = {Lei2010a.pdf:Lei2010a.pdf:PDF},
  owner = {kwg071000},
  pdf = {Lei2010a.pdf},
  timestamp = {2011.01.27}
}
@inproceedings{Lei2009,
  title = {The Role of Age in Factor Analysis for Speaker Identification},
  author = {Yun Lei and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2009},
  file = {Lei2009.pdf:Lei2009.pdf:PDF},
  owner = {kwg071000},
  pdf = {Lei2009.pdf},
  timestamp = {2010.11.15}
}
@inproceedings{Lei2010,
  title = {The CRSS Systems for the 2010 NIST Speaker Recognition Evaluation},
  author = {Yun Lei and Taufiq Hasan and Jun-Won Suh and Abhijeet Sangwan and Hynek Bo\v{r}il and Gang Liu and Keith Godin and John H. L. Hansen},
  booktitle = {Proc. NIST SRE},
  year = {2010},
  file = {Lei2010.pdf:Lei2010.pdf:PDF},
  owner = {kwg071000},
  pdf = {Lei2010.pdf},
  timestamp = {2010.11.15}
}
@inproceedings{Liu2010,
  title = {A Novel Feature Extraction Strategy for Multi-Stream Robust Emotion Identification},
  author = {Gang Liu and Yun Lei and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2010},
  pages = {482-485},
  abstract = {In this study, we investigate an effective feature extraction front-end for improved emotion identification from speech in clean and noisy conditions. First, we explore the application of the PMVDR feature for emotion characterization. Originally developed for accent/dialect and language identification (LID), PMVDR features are less sensitive to noise. Also developed for LID, the shifted delta cepstral (SDC) approach can be used as a means of incorporating additional temporal information about the speech into the feature vectors. As is already known, super-segmental characteristics, such as pitch and intensity, can provide beneficial information for emotion recognition, and we believe the improvement can be acquired from improved features. We performed evaluation on the Berlin database of emotional speech. The proposed system, PMVDR-SDC, outperforms the baseline system by 10.1% absolute, which demonstrates the validity of the approach. Furthermore, we find both PMVDR and SDC offer much better robustness in noisy conditions than other features, which is critical for real applications.},
  file = {Liu2010.pdf:Liu2010.pdf:PDF},
  owner = {kwg071000},
  pdf = {Liu2010.pdf},
  timestamp = {2012.03.06}
}
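
The shifted delta cepstra (SDC) mentioned in the Liu, Lei, and Hansen (2010) abstract above follow a standard N-d-P-k parameterization in the LID literature. A minimal numpy sketch of that common formulation is given below; it is an illustrative reading under assumed defaults (the function name, padding choice, and parameter values are not from the paper):

import numpy as np

def sdc(cep, d=1, P=3, k=7):
    """Stack k shifted delta blocks per frame.

    cep: (T, N) array of cepstral vectors (e.g., MFCC or PMVDR).
    Block i is the simple delta cep[t + i*P + d] - cep[t + i*P - d].
    """
    T, N = cep.shape
    pad = d + (k - 1) * P               # so every shifted index is valid
    padded = np.pad(cep, ((pad, pad), (0, 0)), mode="edge")
    blocks = []
    for i in range(k):
        shift = i * P
        upper = padded[pad + shift + d : pad + shift + d + T]
        lower = padded[pad + shift - d : pad + shift - d + T]
        blocks.append(upper - lower)
    return np.hstack(blocks)            # (T, N*k) SDC feature matrix

A typical LID configuration is 7-1-3-7, i.e., 7 base cepstra with d=1, P=3, k=7.
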
@inproceedings{Liu2012,
  title = {A fast speaker verification with universal background support data selection},
  author = {Gang Liu and Jun-Won Suh and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2012},
  abstract = {In this study, a fast universal background support imposter data selection method is proposed, which is integrated within a support vector machine (SVM) based speaker verification system. Selection of an informative background dataset is crucial in constructing a discriminative decision hyperplane between the enrollment and imposter speakers. Previous studies generally derive the optimal number of imposter examples from development data and apply it to the evaluation data, which cannot guarantee consistent performance and often necessitates expensive searching. In the proposed method, the universal background dataset is derived so as to embed imposter knowledge in a more balanced way. Next, the derived dataset is taken as the imposter set in the SVM modeling process for each enrollment speaker. By using imposter adaptation, a more detailed subspace per target speaker can be constructed. Compared to the popular support-vector frequency based method, the proposed method not only avoids parameter searching but also offers a significant improvement and generalizes better to unseen data.},
  file = {Liu2012.pdf:Liu2012.pdf:PDF},
  owner = {gxl083000},
  pdf = {Liu2012.pdf},
  timestamp = {2013.04.12}
}
@conference{2014_Odyssey_CRSS_iv_ML_system_description_Final,
  title = {CRSS systems for the NIST i-Vector Machine Learning Challenge},
  author = {Gang Liu and Chengzhu Yu and Navid Shokouhi and Abhinav Misra and Hua Xing and John H.L. Hansen},
  booktitle = {Proc. Odyssey},
  year = {2014},
  owner = {axm101521},
  pdf = {2014_Odyssey_CRSS_iv_ML_system_description_Final.pdf},
  timestamp = {2016.04.30}
}
@conference{CP-SLT-2014_iVectorSID-GL-CY-NS-AM-HX-JH-0000418,
  title = {Utilization of Unlabeled Development Data for Speaker Verification},
  author = {G. Liu and C. Yu and N. Shokouhi and A. Misra and H. Xing and John H.L. Hansen},
  booktitle = {Proc. IEEE SLT},
  year = {2014},
  file = {:CP-SLT-2014_iVectorSID-GL-CY-NS-AM-HX-JH-0000418.pdf:PDF},
  owner = {axm101521},
  pdf = {CP-SLT-2014_iVectorSID-GL-CY-NS-AM-HX-JH-0000418.pdf},
  timestamp = {2015.05.03}
}
@inproceedings{Liu2012a,
  title = {A Linguistic Data Acquisition Front-End for Language Recognition Evaluation},
  author = {Gang Liu and Chi Zhang and John H.L. Hansen},
  booktitle = {Proc. Odyssey},
  year = {2012},
  abstract = {One of the major challenges for language identification (LID) systems comes from sparse training data. Manually collecting linguistic data in a controlled studio is usually expensive and impractical, but multilingual broadcast programs (Voice of America, for instance) can be collected as a reasonable alternative for linguistic data acquisition. However, unlike studio-collected linguistic data, broadcast programs usually contain much content other than pure linguistic data: foreground/background music, commercials, and real-life noise. In this study, a systematic processing approach is proposed to extract the linguistic data from the broadcast media. The experimental results obtained on NIST LRE 2009 data show that the proposed method can provide a 22.2% relative improvement in segmentation accuracy and a 20.5% relative improvement in LID accuracy.},
  file = {Liu2012a.pdf:Liu2012a.pdf:PDF},
  owner = {gxl083000},
  pdf = {Liu2012a.pdf},
  timestamp = {2013.04.14}
}
@conference{CP-SLT-2014_DistantASR-MM-JH-0000507,
  title = {Multichannel Feature Enhancement in Distributed Microphone Arrays for Robust Distant Speech Recognition in Smart Rooms},
  author = {S.M. Mirsamadi and John H.L. Hansen},
  booktitle = {Proc. IEEE SLT},
  year = {2014},
  file = {:CP-SLT-2014_DistantASR-MM-JH-0000507.pdf:PDF},
  owner = {axm101521},
  pdf = {CP-SLT-2014_DistantASR-MM-JH-0000507.pdf},
  timestamp = {2015.05.03}
}
@conference{CP-SLT-2014_LangMismatchSID-AM-JH-0000372,
  title = {Spoken Language Mismatch in Speaker Verification: An Investigation with NIST-SRE and CRSS Bi-Ling Corpora},
  author = {A. Misra and John H.L. Hansen},
  booktitle = {Proc. IEEE SLT},
  year = {2014},
  file = {:CP-SLT-2014_LangMismatchSID-AM-JH-0000372.pdf:PDF},
  owner = {axm101521},
  pdf = {CP-SLT-2014_LangMismatchSID-AM-JH-0000372.pdf},
  timestamp = {2015.05.03}
}
@conference{CP-ICASSP15-Scream-MN_AZ_JH-0000161,
  title = {Robust Unsupervised Detection of Human Screams in Noisy Acoustic Environments},
  author = {M.K. Nandwana and A. Ziaei and John H.L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2015},
  file = {:CP-ICASSP15-Scream-MN_AZ_JH-0000161.pdf:PDF},
  owner = {axm101521},
  pdf = {CP-ICASSP15-Scream-MN_AZ_JH-0000161.pdf},
  timestamp = {2015.05.03}
}
@article{Patil2010,
  title = {The physiological microphone ({PMIC}): A competitive alternative for speaker assessment in stress detection and speaker verification},
  author = {Sanjay A. Patil and John H. L. Hansen},
  journal = {Speech Commun.},
  year = {2010},
  pages = {327-340},
  volume = {52},
  abstract = {Interactive speech system scenarios exist which require the user to perform tasks which exert limitations on speech production, thereby causing speaker variability and reduced speech performance. In noisy stressful scenarios, even if noise could be completely eliminated, the production variability brought on by stress, including Lombard effect, has a more pronounced impact on speech system performance. Thus, in this study we focus on the use of a silent speech interface (PMIC), with a corresponding experimental assessment to illustrate its utility in the tasks of stress detection and speaker verification. This study focuses on the suitability of the PMIC versus the close-talk microphone (CTM), and reports that the PMIC achieves performance as good as or better than the CTM for a number of test conditions. The PMIC reflects both stress-related information and speaker-dependent information to a far greater extent than the CTM. For stress detection (reported in % accuracy), the PMIC performs on par with or about 2% better than the CTM-based system. For a speaker verification application, the PMIC outperforms the CTM for all matched stress conditions. The performance reported in terms of %EER is 0.91% (as compared to 1.69%), 0.45% (as compared to 1.49%), and 1.42% (as compared to 1.80%) for the PMIC. This indicates that the PMIC reflects speaker-dependent information. Another advantage of the PMIC is its ability to record the user's physiological traits/state. Our experiments illustrate that the PMIC can be an attractive alternative for stress detection as well as speaker verification tasks, with the added ability to record physiological information in situations where the use of a CTM may hinder operations (deep-sea divers, fire-fighters in rescue operations, etc.). Keywords: Physiological sensor; Stress detection; Speaker verification; Non-acoustic sensor; PMIC},
  file = {Patil2010.pdf:Patil2010.pdf:PDF},
  owner = {kwg071000},
  pdf = {Patil2010.pdf},
  timestamp = {2010.02.26}
}
@inproceedings{Patil2008,
  title = {Detection of speech under physical stress: Model development, sensor selection, and feature fusion},
  author = {Sanjay A. Patil and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2008},
  abstract = {Speech system scenarios can require the user to perform tasks which exert limitations on speech production/physiology, thereby causing speaker variability and reduced speech system performance. This is speech under stress, which differs from speech under neutral conditions. The stress can be physical, cognitive, or noise induced (Lombard). In this study, the focus is on physical stress, with specific emphasis on: (i) the number of speakers used for modeling, (ii) alternative audio sensors, and (iii) fusion based stress detection using a new audio corpus (UT-Scope). We used a GMM framework with our previously formulated TEO-CB-AutoEnv features for neutral/physical stress detection. Second, stress detection performance is investigated for both acoustic and non-acoustic (P-MIC) sensors. Evaluations show that effective stress models can be obtained with 12 speakers out of a random size of 1-42 subjects, with stress detection performance of 62.96% (for close-talking mic) and 66.36% (for P-MIC) respectively. The TEO-CB-AutoEnv model scores were fused with traditional MFCC based stress model scores using the Adaboost algorithm, resulting in an improvement in overall system performance of 9.43% (absolute, for close-talking mic) and 12.99% (absolute, for P-MIC) respectively. These three advances allow for effective stress detection algorithm development with fewer training speakers and/or alternative sensors in combined feature domains. Index Terms: Stress Detection, TEO-CB-AutoEnv, P-MIC, Speaker Variability.},
  file = {Patil2008.pdf:Patil2008.pdf:PDF},
  owner = {kwg071000},
  pdf = {Patil2008.pdf},
  timestamp = {2009.07.24}
}
@inproceedings{Patil2010a,
  title = {Speech under physical stress: A production-based framework},
  author = {Sanjay A. Patil and Abhijeet Sangwan and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2010},
  pages = {5146-5149},
  abstract = {This paper examines the impact of physical stress on speech. The methodology adopted here identifies inter-utterance breathing (IUB) patterns as a key intermediate variable while studying the relationship between physical stress and speech. Additionally, this work connects high-level prosodic changes in the speech signal (energy, pitch, and duration) to the corresponding breathing patterns. Our results demonstrate the diversity of breathing and articulation patterns that speakers employ in order to compensate for the increased body oxygen demand. Here, we identify the normalized value of breathing energy rate (proportional to minute volume) acquired from a conventional as well as physiological microphone as a reliable and accurate estimator of physical stress. Additionally, we also show that the prosodic patterns (pitch, energy, and duration) of high-level speech structure shows good correlation with the normalized-breathing energy rate. In this manner, the study establishes the interconnection between temporal speech structure and physical stress through breathing.},
  file = {Patil2010a.pdf:Patil2010a.pdf:PDF},
  owner = {kwg071000},
  pdf = {Patil2010a.pdf},
  timestamp = {2012.03.21}
}
@inproceedings{Pellom1999,
  title = {An experimental study of speaker verification sensitivity to computer voice-altered imposters},
  author = {Bryan L. Pellom and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {1999},
  abstract = {This paper investigates the relative sensitivity of a Gaussian mixture model (GMM) based voice verification algorithm to computer voice-altered imposters. First, a new trainable speech synthesis algorithm based on trajectory models of the speech line spectral frequency (LSF) parameters is presented in order to model the spectral characteristics of a target voice. A GMM based speaker verifier is then constructed for the 138 speaker YOHO database and shown to have an initial equal-error rate (EER) of 1.45% for the case of casual imposter attempts using a single combination-lock phrase test. Next, imposter voices are automatically altered using the synthesis algorithm to mimic the customer's voice. After voice transformation, the false acceptance rate is shown to increase from 1.45% to over 86% if the baseline EER threshold is left unmodified. Furthermore, at a customer false rejection rate of 25%, the false acceptance rate for the voice-altered imposter remains as high as 34.6%.},
  file = {Pellom1999.pdf:Pellom1999.pdf:PDF},
  owner = {kwg071000},
  pdf = {Pellom1999.pdf},
  timestamp = {2009.06.05}
}
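
The verification protocol in the Pellom and Hansen (1999) abstract — a GMM for the claimed speaker scored against casual or voice-altered imposters at a fixed threshold — can be sketched in a generic GMM form. This is a minimal sketch under stated assumptions (diagonal covariances, a background model for normalization, per-frame average log-likelihoods), not the paper's exact system:

import numpy as np
from sklearn.mixture import GaussianMixture

def train_gmm(features, n_mix=64):
    """Fit a diagonal-covariance GMM to (frames, dims) features."""
    return GaussianMixture(n_components=n_mix,
                           covariance_type="diag").fit(features)

def verify(test_feats, target_gmm, background_gmm, threshold):
    """Accept the claimed identity if the average log-likelihood
    ratio of the test frames exceeds the operating threshold."""
    llr = target_gmm.score(test_feats) - background_gmm.score(test_feats)
    return llr > threshold

The paper's central point is that a threshold set at the casual-imposter EER no longer protects once imposter voices are synthetically altered toward the customer's voice.
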
@article{Pellom1998,
  title = {An improved {(Auto:I, LSP:T)} constrained iterative speech enhancement for colored noise environments},
  author = {Bryan L. Pellom and John H. L. Hansen},
  journal = {IEEE Trans. Speech Audio Process.},
  year = {1998},
  number = {6},
  pages = {573-579},
  volume = {6},
  abstract = {In this correspondence we illustrate how the (Auto:I, LSP:T) constrained iterative speech enhancement algorithm can be extended to provide improved performance in colored noise environments. The modified algorithm, referred to here as noise adaptive (Auto:I, LSP:T), operates on subbanded signal components in which the terminating iteration is adjusted based on the a posteriori estimate of the signal-to-noise ratio (SNR) in each signal subband. The enhanced speech is formulated as a combined estimate from individual signal subband estimators. The algorithm is shown to improve objective speech quality in additive noise environments over the traditional constrained iterative (Auto:I, LSP:T) enhancement formulation.},
  file = {Pellom1998.pdf:Pellom1998.pdf:PDF},
  owner = {kwg071000},
  pdf = {Pellom1998.pdf},
  timestamp = {2011.04.15}
}
@inproceedings{Rahurkar2002,
  title = {Frequency Band Analysis for Stress Detection Using a Teager Energy Operator Based Feature},
  author = {Mandar A. Rahurkar and John H.L. Hansen and James Meyerhoff and George Saviolakis and Michael Koenig},
  booktitle = {Proc. ICSLP},
  year = {2002},
  pages = {2021-2024},
  abstract = {Studies have shown that the performance of speech recognition algorithms severely degrades due to the presence of task- and emotion-induced stress in adverse conditions. This paper addresses the problem of detecting the presence of stress in speech by analyzing nonlinear feature characteristics in specific frequency bands. The framework of the previously derived Teager Energy Operator (TEO) based feature TEO-CB-AutoEnv is used. A new detection scheme is proposed based on weighted TEO features derived from critical band frequencies. The new detection framework is evaluated on a military speech corpus collected in a Soldier of the Quarter (SOQ) paradigm. Heart rate and blood pressure measurements confirm subjects were under stress. Using the traditional TEO-CB-AutoEnv feature with an HMM trained stressed speech classifier, we show error rates of 22.5% and 13% for stress and neutral speech detection. With the new weighted sub-band detection scheme, detection error rates are reduced to 4.7% and 4.6% for stress and neutral detection, a relative error reduction of 79.1% and 64.6% respectively. Finally, we discuss issues related to generation of stress anchor models and speaker dependency.},
  file = {Rahurkar2002.pdf:Rahurkar2002.pdf:PDF},
  owner = {kwg071000},
  pdf = {Rahurkar2002.pdf},
  timestamp = {2012.03.06}
}
@conference{2016_ICASSP_LID_DNN,
  title = {Language Recognition using Deep Neural Networks with Very Limited Training Data},
  author = {S. Ranjan and C. Yu and C. Zhang and F. Kelly and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2016},
  owner = {axm101521},
  pdf = {2016_ICASSP_LID_DNN.pdf},
  timestamp = {2016.04.20}
}
@inproceedings{Ruzanski2005,
  title = {Effects of phoneme characteristics on TEO feature-based automatic stress detection in speech},
  author = {Evan Ruzanski and John H. L. Hansen and James Meyerhoff and George Saviolakis and Michael Koenig},
  booktitle = {Proc. IEEE ICASSP},
  year = {2005},
  abstract = {A major challenge of automatic speech recognition systems found in many areas of today's society is the ability to overcome natural phoneme conditions that potentially degrade performance. In this study, we discuss the effects of two critical phoneme characteristics, decreased vowel duration and mismatched vowel type, on the performance of automatic stress detection in speech using Teager Energy Operator features. We determine the scope and magnitude of these effects on stress detection performance and propose an algorithm to compensate for vowel type and duration shortening on stress detection performance using a composite phoneme decision scheme, which results in relative error reductions of 24% and 39% in the non-stress and stress conditions, respectively.},
  file = {Ruzanski2005.pdf:Ruzanski2005.pdf:PDF},
  owner = {kwg071000},
  pdf = {Ruzanski2005.pdf},
  timestamp = {2009.08.31}
}
@article{sadjadi2013combo,
  title = {Unsupervised speech activity detection using voicing measures and perceptual spectral flux},
  author = {Seyed Omid Sadjadi and John H.L. Hansen},
  journal = {IEEE Signal Process. Lett.},
  year = {2013},
  pages = {197-200},
  volume = {20},
  file = {:sadjadi2013combo.pdf:PDF},
  owner = {axm101521},
  pdf = {sadjadi2013combo.pdf},
  timestamp = {2013.09.12}
}
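
The detector in sadjadi2013combo combines voicing measures with perceptual spectral flux. The sketch below shows only a plain (unweighted) spectral-flux measure in that spirit; the perceptual weighting and the voicing measures from the paper are not reproduced, and the framing parameters are assumptions:

import numpy as np
from scipy.signal import stft

def spectral_flux(x, fs, frame=0.025, step=0.010):
    """Frame-to-frame change of the magnitude spectrum of signal x."""
    _, _, S = stft(x, fs, nperseg=int(frame * fs),
                   noverlap=int((frame - step) * fs))
    mag = np.abs(S)                                  # (freq, time)
    flux = np.sqrt((np.diff(mag, axis=1) ** 2).mean(axis=0))
    return flux   # large values tend to coincide with speech activity
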
@inproceedings{sadjadi2013front,
  title = {Robust front-end processing for speaker identification over extremely degraded communication channels},
  author = {Seyed Omid Sadjadi and John H.L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2013},
  file = {:sadjadi2013front.pdf:PDF},
  owner = {axm101521},
  pdf = {sadjadi2013front.pdf},
  timestamp = {2013.09.12}
}
@inproceedings{sadjadi2012bsw,
  title = {Blind reverberation mitigation for robust speaker identification},
  author = {Seyed Omid Sadjadi and John H.L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2012},
  file = {:sadjadi2012bsw.pdf:PDF},
  owner = {axm101521},
  pdf = {sadjadi2012bsw.pdf},
  timestamp = {2013.09.12}
}
@inproceedings{sadjadi2011mhec,
  title = {Hilbert envelope based features for robust speaker identification under reverberant mismatched conditions},
  author = {Seyed Omid Sadjadi and John H.L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2011},
  file = {:sadjadi2011mhec.pdf:PDF},
  owner = {axm101521},
  pdf = {sadjadi2011mhec.pdf},
  timestamp = {2013.09.12}
}
@inproceedings{Sadjadi2010,
  title = {Assessment of Single-Channel Speech Enhancement Techniques for Speaker Identification under Mismatched Conditions},
  author = {Seyed Omid Sadjadi and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2010},
  pages = {2138-2141},
  abstract = {It is well known that MFCC based speaker identification (SID) systems easily break down under mismatched training and test conditions. In this paper, we report on a study that considers four different single-channel speech enhancement front-ends for robust SID under such conditions. Speech files from the YOHO database are corrupted with four types of noise including babble, car, factory, and white Gaussian at five SNR levels (0-20 dB), and processed using four speech enhancement techniques representing distinct classes of algorithms: spectral subtraction, statistical model-based, subspace, and Wiener filtering. Both processed and unprocessed files are submitted to a SID system trained on clean data. In addition, a new set of acoustic feature parameters based on the Hilbert envelope of gammatone filterbank outputs is proposed and evaluated for the SID task. Experimental results indicate that: (i) depending on the noise type and SNR level, the enhancement front-ends may help or hurt SID performance, and (ii) the proposed features achieve significantly higher SID accuracy than MFCCs under mismatched conditions.},
  file = {Sadjadi2010.pdf:Sadjadi2010.pdf:PDF},
  owner = {kwg071000},
  pdf = {Sadjadi2010.pdf},
  timestamp = {2011.09.26}
}
@inproceedings{Sadjadi2012,
  title = {Mean Hilbert Envelope Coefficients ({MHEC}) for Robust Speaker Recognition},
  author = {Seyed Omid Sadjadi and Taufiq Hasan and John H.L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2012},
  abstract = {The recently introduced mean Hilbert envelope coefficients (MHEC) have been shown to be an effective alternative to MFCCs for robust speaker identification under noisy and reverberant conditions in relatively small tasks. In this study, we investigate the effectiveness of these acoustic features in the context of a state-of-the-art speaker recognition system. The i-vectors are used to represent the acoustic space of speakers, while modeling is performed via probabilistic linear discriminant analysis (PLDA). We report speaker verification performance on the NIST SRE-2010 extended telephone and microphone trials for both female and male genders. Experimental results confirm consistent superiority of MHECs over traditional MFCCs within i-vector speaker verification, particularly under microphone and telephone training-test mismatch conditions. In addition, fusion of subsystems trained with the individual front-ends proves that the two acoustic features (i.e., MHEC and MFCC) provide complementary information for recognizing speakers. Index Terms: Mean Hilbert Envelope Coefficients (MHEC), mismatch conditions, NIST SRE, speaker recognition},
  file = {Sadjadi2012.pdf:Sadjadi2012.pdf:PDF},
  owner = {kwg071000},
  pdf = {Sadjadi2012.pdf},
  timestamp = {2012.09.24}
}
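
The Sadjadi, Hasan, and Hansen (2012) abstract above outlines the MHEC pipeline: gammatone filterbank, Hilbert envelope, frame-level averaging, then log compression and DCT. A simplified sketch under those assumptions follows; band spacing, frame sizes, and the absence of envelope smoothing are placeholders rather than the published configuration:

import numpy as np
from scipy.signal import gammatone, hilbert, lfilter
from scipy.fft import dct

def mhec_like(x, fs, centers, frame=400, hop=160, n_ceps=13):
    """MHEC-style cepstra from per-band mean Hilbert envelopes."""
    bands = []
    for fc in centers:                       # e.g., ERB-spaced 100-4000 Hz
        b, a = gammatone(fc, "iir", fs=fs)
        env = np.abs(hilbert(lfilter(b, a, x)))   # Hilbert envelope
        n_frames = 1 + (len(env) - frame) // hop
        bands.append([env[i*hop : i*hop + frame].mean()
                      for i in range(n_frames)])
    log_env = np.log(np.asarray(bands).T + 1e-10)  # (frames, bands)
    return dct(log_env, type=2, axis=1, norm="ortho")[:, :n_ceps]
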
@conference{CP-ICASSP15-Lombard-MS_GL_JH-0004350,
  title = {Weighted Training for Speech Under Lombard Effect for Speaker Recognition},
  author = {M.M. Saleem and G. Liu and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2015},
  file = {:CP-ICASSP15-Lombard-MS_GL_JH-0004350.pdf:PDF},
  owner = {axm101521},
  pdf = {CP-ICASSP15-Lombard-MS_GL_JH-0004350.pdf},
  timestamp = {2015.05.03}
}
@inproceedings{Sarikaya1998,
  title = {Wavelet Packet Transform Features with Application to Speaker Identification},
  author = {Ruhi Sarikaya and Bryan L. Pellom and John H. L. Hansen},
  booktitle = {Nordic Signal Processing Symposium},
  year = {1998},
  abstract = {This study proposes a new set of feature parameters based on wavelet packet transform analysis of the speech signal. The new speech features are named subband based cepstral parameters (SBC) and wavelet packet parameters (WPP). The ability of each parameter set to capture speaker identity conveyed in the speech signal is compared to the widely used Mel-frequency cepstral coefficients (MFCC). The proposed parameterization methods are shown to achieve 48% and 67% reduction in relative error over MFCC for 630 and 168 speakers, respectively, using the TIMIT (downsampled to 8kHz) database.},
  file = {Sarikaya1998.pdf:Sarikaya1998.pdf:PDF},
  owner = {kwg071000},
  pdf = {Sarikaya1998.pdf},
  timestamp = {2009.06.29}
}
@conference{CP-ICASSP15-OverlapSpeechPLL-NS_AZ_JH-0004724,
  title = {Robust Overlapped Speech Detection and its Application in Word-Count Estimation for Prof-Life-Log Data},
  author = {N. Shokouhi and A. Ziaei and A. Sangwan and John H.L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2015},
  file = {:CP-ICASSP15-OverlapSpeechPLL-NS_AZ_JH-0004724.pdf:PDF},
  owner = {axm101521},
  pdf = {CP-ICASSP15-OverlapSpeechPLL-NS_AZ_JH-0004724.pdf},
  timestamp = {2015.05.03}
}
@inproceedings{Varadarajan2006,
  title = {Analysis of Lombard effect under different types and levels of noise with application to In-set Speaker ID systems},
  author = {Vaishnevi Varadarajan and John H. L. Hansen},
  booktitle = {Proc. Interspeech},
  year = {2006},
  month = {September},
  organization = {ISCA},
  pages = {937-940},
  abstract = {This paper presents an analysis of Lombard speech produced under different types and levels of noise. The speech used for the analysis forms a part of the UT-SCOPE database and consists of sentences from the well-known TIMIT corpus, spoken in the presence of highway, large crowd, and pink noise. Differences are shown to exist in the speech characteristics under these varying noise types. The deterioration of the EER of an in-set speaker identification system trained on neutral and tested with Lombard speech is also illustrated. A clear demarcation between the effect of noise and that of the Lombard effect is also given by testing with noisy Lombard speech. The effect of test-token duration on system performance under the Lombard condition is addressed. It is seen that test duration has little effect on the EER under Lombard effect. The average EER for 3s test duration is 14.7%, 28.3%, 48.2%, and 51.3% for neutral clean, clean Lombard, noisy neutral, and noisy Lombard respectively, and 7.2%, 26.4%, 45.8%, and 50.8% respectively for 12s test duration.},
  file = {Varadarajan2006.pdf:Varadarajan2006.pdf:PDF},
  owner = {kwg071000},
  pdf = {Varadarajan2006.pdf},
  timestamp = {2009.06.29}
}
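
Several abstracts in this file quote equal error rates (EER), e.g., Varadarajan and Hansen (2006) above. For reference, EER can be computed from raw trial scores with a routine like this sketch (labels: 1 = target trial, 0 = impostor trial):

import numpy as np

def equal_error_rate(scores, labels):
    """EER: the operating point where false-accept and false-reject
    rates are (approximately) equal, swept over sorted scores."""
    scores = np.asarray(scores, dtype=float)
    labels = np.asarray(labels)
    order = np.argsort(scores)[::-1]        # accept the top-scoring trials
    labels = labels[order]
    n_tar = labels.sum()
    n_imp = len(labels) - n_tar
    fa = np.cumsum(1 - labels) / n_imp      # impostors accepted so far
    fr = 1.0 - np.cumsum(labels) / n_tar    # targets still rejected
    i = np.argmin(np.abs(fa - fr))
    return 0.5 * (fa[i] + fr[i])
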
@inproceedings{Varadarajan2006a,
  title = {{UT-Scope} - A corpus for speech under cognitive/physical task stress and emotion},
  author = {Vaishnevi Varadarajan and John H. L. Hansen and Ayako Ikeno},
  booktitle = {Proc. LREC Workshop Speech Under Emotion},
  year = {2006},
  month = {May},
  owner = {kwg071000},
  timestamp = {2008.03.13}
}
@techreport{Vloeberghs2000,
  title = {The Impact of Speech Under Stress on Military Speech Technology},
  author = {Claude Vloeberghs and Patrick Verlinde and Carl Swail and Herman Steeneken and David van Leeuwen and Isabel Trancoso and Allan South and Roger Moore and E. James Cupples and Timothy Anderson and John Hansen},
  institution = {NATO Research and Technology Organization},
  year = {2000},
  month = {March},
  number = {RTO-TR-10},
  file = {Steeneken2000.pdf:Steeneken2000.pdf:PDF},
  owner = {kwg071000},
  pdf = {Steeneken2000.pdf},
  timestamp = {2008.02.26}
}
@article{Womack1999,
  title = {{N}-channel hidden {Markov} models for combined stressed speech classification and recognition},
  author = {Brian D. Womack and John H. L. Hansen},
  journal = {IEEE Trans. Speech Audio Process.},
  year = {1999},
  month = {November},
  number = {6},
  pages = {668-677},
  volume = {7},
  abstract = {Robust speech recognition systems must address variations due to perceptually induced stress in order to maintain acceptable levels of performance in adverse conditions. One approach for addressing these variations is to utilize front-end stress classification to direct a stress dependent recognition algorithm which separately models each speech production domain. This study proposes a new approach which combines stress classification and speech recognition functions into one algorithm. This is accomplished by generalizing the one-dimensional (1-D) hidden Markov model to an N-channel hidden Markov model (N-channel HMM). Here, each stressed speech production style under consideration is allocated a dimension in the N-channel HMM to model each perceptually induced stress condition. It is shown that this formulation better integrates perceptually induced stress effects for stress independent recognition. This is due to the sub-phoneme (state level) stress classification that is implicitly performed by the algorithm. The proposed N-channel stress independent HMM method is compared to a previously established one-channel stress dependent isolated word recognition system yielding a 73.8% reduction in error rate. In addition, an 82.7% reduction in error rate is observed compared to the common one-channel neutral trained recognition approach. Index Terms- Lombard effect, N-channel Markov model, speech recognition, stress classification.},
  file = {Womack1999.pdf:Womack1999.pdf:PDF},
  owner = {kwg071000},
  pdf = {Womack1999.pdf},
  timestamp = {2008.02.05}
}
@inproceedings{Womack1996,
  title = {Improved speech recognition via speaker stress directed classification},
  author = {Brian D. Womack and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {1996},
  address = {Atlanta},
  pages = {53-57},
  abstract = {Speech production variations due to perceptually induced stress contribute significantly to reduced speech processing performance. This study proposes an algorithm for estimation of the degree of perceptually induced stress. It is suggested that the resulting stress score could be integrated into speech processing algorithms to improve robustness in adverse conditions. First, results from a previous study motivate selection of a targeted set of speech features across phoneme and stress groups to improve stress classification performance. Analysis of articulatory, excitation, and cepstral based features is conducted using a previously established stressed speech database (SUSAS). Targeted feature sets are selected across ten stress conditions (including Apache helicopter, angry, clear, Lombard effect, loud, etc.). Next, an improved targeted feature stress classification system is developed and evaluated achieving rates of 91.01%. Finally, application of stress classification is incorporated into a stress directed speech recognition system. An improvement of +10.14% and +15.43% over conventionally trained neutral and multi-style trained recognizers is demonstrated using the new stress directed recognition system.},
  file = {Womack1996.pdf:Womack1996.pdf:PDF},
  owner = {kwg071000},
  pdf = {Womack1996.pdf},
  timestamp = {2009.07.15}
}
@article{Womack1996a,
  title = {Classification of speech under stress using target driven features},
  author = {Brian D. Womack and John H. L. Hansen},
  journal = {Speech Commun.},
  year = {1996},
  month = {November},
  number = {1-2},
  pages = {131-150},
  volume = {20},
  abstract = {Speech production variations due to perceptually induced stress contribute significantly to reduced speech processing performance. One approach for assessment of production variations due to stress is to formulate an objective classification of speaker stress based upon the acoustic speech signal. This study proposes an algorithm for estimation of the probability of perceptually induced stress. It is suggested that the resulting stress score could be integrated into robust speech processing algorithms to improve robustness in adverse conditions. First, results from a previous stress classification study are employed to motivate selection of a targeted set of speech features on a per phoneme and stress group level. Analysis of articulatory, excitation and cepstral based features is conducted using a previously established stressed speech database (Speech Under Simulated and Actual Stress (SUSAS)). Stress sensitive targeted feature sets are then selected across ten stress conditions (including Apache helicopter cockpit, Angry, Clear, Lombard effect, Loud, etc.) and incorporated into a new targeted neural network stress classifier. Second, the targeted feature stress classification system is then evaluated and shown to achieve closed speaker, open token classification rates of 91.0%. Finally, the proposed stress classification algorithm is incorporated into a stress directed speech recognition system, where separate hidden Markov model recognizers are trained for each stress condition. An improvement of +10.1% and +15.4% over conventionally trained neutral and multi-style trained recognizers is demonstrated using the new stress directed recognition approach.},
  file = {Womack1996a.pdf:Womack1996a.pdf:PDF},
  owner = {kwg071000},
  pdf = {Womack1996a.pdf},
  timestamp = {2008.03.11}
}
@inproceedings{Womack1995,
  title = {Stress Independent Robust {HMM} Speech Recognition using Neural Network Stress Classification},
  author = {Brian D. Womack and John H. L. Hansen},
  booktitle = {Proc. Eurospeech},
  year = {1995},
  month = {September},
  file = {Womack1995.pdf:Womack1995.pdf:PDF},
  owner = {kwg071000},
  pdf = {Womack1995.pdf},
  timestamp = {2007.12.02}
}
@inproceedings{Yapanel2003,
  title = {A New Perspective on Feature Extraction for Robust In-Vehicle Speech Recognition},
  author = {Umit Yapanel and John H. L. Hansen},
  booktitle = {Proc. Eurospeech},
  year = {2003},
  file = {Yapanel2003.pdf:Yapanel2003.pdf:PDF},
  owner = {kwg071000},
  pdf = {Yapanel2003.pdf},
  timestamp = {2007.09.26}
}
@article{Yapanel2008,
  title = {A new perceptually motivated MVDR-based acoustic front-end (PMVDR) for robust automatic speech recognition},
  author = {Umit H. Yapanel and John H.L. Hansen},
  journal = {Speech Commun.},
  year = {2008},
  pages = {142-152},
  volume = {50},
  abstract = {Acoustic feature extraction from speech constitutes a fundamental component of automatic speech recognition (ASR) systems. In this paper, we propose a novel feature extraction algorithm, perceptual-MVDR (PMVDR), which computes cepstral coefficients from the speech signal. This new feature representation is shown to better model the speech spectrum compared to traditional feature extraction approaches. Experimental results for small (40-word digits) to medium (5k-word dictation) size vocabulary tasks show varying degrees of consistent improvement across different experiments; however, the new front-end is most effective in noisy car environments. The PMVDR front-end uses the minimum variance distortionless response (MVDR) spectral estimator to represent the upper envelope of the speech signal. Unlike Mel frequency cepstral coefficients (MFCCs), the proposed front-end does not utilize a filterbank. The effectiveness of the PMVDR approach is demonstrated by comparing speech recognition accuracies with the traditional MFCC front-end and the recently proposed PMCC front-end in both noise-free and real adverse environments. For speech recognition in noisy car environments, a 40-word vocabulary task, the PMVDR front-end provides a 36% relative decrease in word error rate (WER) over the MFCC front-end. Under simulated speaker stress conditions, a 35-word vocabulary task, the PMVDR front-end yields a 27% relative decrease in the WER. For a noise-free 5k-word vocabulary dictation task, a relative 8% reduction in the WER is again reported. Finally, a novel analysis technique is proposed to quantify the noise robustness of an acoustic front-end. This analysis is conducted for the acoustic front-ends analyzed in the paper and results are presented.},
  file = {Yapanel2008.pdf:Yapanel2008.pdf:PDF},
  owner = {kwg071000},
  pdf = {Yapanel2008.pdf},
  timestamp = {2012.03.07}
}
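
The PMVDR front-end in Yapanel and Hansen (2008) above is built on the MVDR (Capon) spectral estimate, P(w) = 1 / (e^H R^{-1} e) for steering vector e and autocorrelation matrix R. A direct, unwarped sketch of that estimator follows; the perceptual frequency warping that gives PMVDR its name is omitted, and the model order and regularization are assumptions:

import numpy as np
from scipy.linalg import toeplitz

def mvdr_spectrum(x, order=12, n_freq=256):
    """Unwarped MVDR (Capon) spectral estimate of a 1-D frame x."""
    x = np.asarray(x, dtype=float)
    # Biased autocorrelation estimates up to the model order.
    r = np.array([np.dot(x[:len(x) - k], x[k:])
                  for k in range(order + 1)]) / len(x)
    R_inv = np.linalg.inv(toeplitz(r) + 1e-8 * np.eye(order + 1))
    w = np.linspace(0.0, np.pi, n_freq)
    spec = np.empty(n_freq)
    for i, wi in enumerate(w):
        e = np.exp(-1j * wi * np.arange(order + 1))   # steering vector
        spec[i] = 1.0 / np.real(np.conj(e) @ R_inv @ e)
    return w, spec
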
@conference{2016_ICASSP_LRE_ivML,
  title = {UTD-CRSS Systems for NIST 2015 Language Recognition i-Vector Machine Learning Challenge},
  author = {C. Yu and C. Zhang and S. Ranjan and Q. Zhang and A. Misra and F. Kelly and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2016},
  owner = {axm101521},
  pdf = {2016_ICASSP_LRE_ivML.pdf},
  timestamp = {2016.04.20}
}
@conference{CP-SLT-2014_OpenSetSID-QZ-JH-0000384,
  title = {Training Candidate Selection for Effective Rejection in Open-Set Language Identification},
  author = {Q. Zhang and John H.L. Hansen},
  booktitle = {Proc. IEEE SLT},
  year = {2014},
  file = {:CP-SLT-2014_OpenSetSID-QZ-JH-0000384.pdf:PDF},
  owner = {axm101521},
  pdf = {CP-SLT-2014_OpenSetSID-QZ-JH-0000384.pdf},
  timestamp = {2015.05.03}
}
@article{Zhou2001,
  title = {Nonlinear feature based classification of speech under stress},
  author = {Guojun Zhou and John H. L. Hansen and James F. Kaiser},
  journal = {IEEE Trans. Speech Audio Process.},
  year = {2001},
  month = {March},
  number = {3},
  pages = {201-216},
  volume = {9},
  abstract = {Studies have shown that variability introduced by stress or emotion can severely reduce speech recognition accuracy. Techniques for detecting or assessing the presence of stress could help improve the robustness of speech recognition systems. Although some acoustic variables derived from linear speech production theory have been investigated as indicators of stress, they are not always consistent. In this paper, three new features derived from the nonlinear Teager energy operator (TEO) are investigated for stress classification. It is believed that the TEO based features are better able to reflect the nonlinear airflow structure of speech production under adverse stressful conditions. The features proposed include TEO-decomposed FM variation (TEO-FM-Var), normalized TEO autocorrelation envelope area (TEO-Auto-Env), and critical band based TEO autocorrelation envelope area (TEO-CB-Auto-Env). The proposed features are evaluated for the task of stress classification using simulated and actual stressed speech and it is shown that the TEO-CB-Auto-Env feature outperforms traditional pitch and mel-frequency cepstrum coefficients (MFCC) substantially. Performance for TEO based features is maintained in both text-dependent and text-independent models, while performance of traditional features degrades in text-independent models. Overall neutral versus stress classification rates are also shown to be more consistent across different stress styles. Index Terms - Human factors, nonlinear speech feature, speech analysis, speech recognition, stress classification, Teager energy operator (TEO).},
  file = {Zhou2001.pdf:Zhou2001.pdf:PDF},
  owner = {kwg071000},
  pdf = {Zhou2001.pdf},
  timestamp = {2008.02.06}
}
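
The Teager energy operator underlying the TEO features in the Zhou, Hansen, and Kaiser (2001) abstract above has the standard discrete form Psi[x(n)] = x(n)^2 - x(n-1) x(n+1) (Kaiser's formulation). A one-function numpy sketch:

import numpy as np

def teager_energy(x):
    """Discrete Teager energy; output is two samples shorter than x."""
    x = np.asarray(x, dtype=float)
    return x[1:-1] ** 2 - x[:-2] * x[2:]
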
@inproceedings{Zhou1998,
  title = {Linear and Nonlinear Speech Feature Analysis for Stress Classification},
  author = {Guojun Zhou and John H. L. Hansen and James F. Kaiser},
  booktitle = {Proc. ICSLP},
  year = {1998},
  file = {Zhou1998.pdf:Zhou1998.pdf:PDF},
  owner = {kwg071000},
  pdf = {Zhou1998.pdf},
  timestamp = {2009.04.17}
}
@conference{CP-ICASSP15-ProfLifeLog-AZ_AS_LK_JH-0004719,
  title = {Prof-Life-Log: Analysis and Classification of Activities in Daily Audio Streams},
  author = {A. Ziaei and A. Sangwan and L. Kaushik and John H. L. Hansen},
  booktitle = {Proc. IEEE ICASSP},
  year = {2015},
  file = {:CP-ICASSP15-ProfLifeLog-AZ_AS_LK_JH-0004719.pdf:PDF},
  owner = {axm101521},
  pdf = {CP-ICASSP15-ProfLifeLog-AZ_AS_LK_JH-0004719.pdf},
  timestamp = {2015.05.03}
}
