publications | Chung-Ming Chien (簡仲明)

2026

TMLR 2026

On The Landscape of Spoken Language Models: A Comprehensive Survey

Chung-Ming Chien*, Siddhant Arora*, Kai-Wei Chang*, and 7 more authors

*equal contribution

Transactions on Machine Learning Research

@article{arora2025on,
  title = {On The Landscape of Spoken Language Models: A Comprehensive Survey},
  author = {Chien*, Chung-Ming and Arora*, Siddhant and Chang*, Kai-Wei and Peng*, Yifan and Wu*, Haibin and Adi, Yossi and Dupoux, Emmanuel and Lee, Hung-Yi and Livescu, Karen and Watanabe, Shinji},
  journal = {Transactions on Machine Learning Research},
  year = {2026},
  month = jan,
  issn = {2835-8856},
  eprint = {2504.08528},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
}

2025

ICLR 2025

Dynamic-SUPERB Phase-2: A Collaboratively Expanding Benchmark for Measuring the Capabilities of Spoken Language Models with 180 Tasks

Chien-yu Huang, Wei-Chih Chen, Shu-wen Yang, and 73 more authors

In ICLR 2025

arXiv Bib

@inproceedings{huang2025dynamicsuperb,
  title = {Dynamic-{SUPERB} Phase-2: A Collaboratively Expanding Benchmark for Measuring the Capabilities of Spoken Language Models with 180 Tasks},
  author = {Huang, Chien-yu and Chen, Wei-Chih and Yang, Shu-wen and Liu, Andy T. and Li, Chen-An and Lin, Yu-Xiang and Tseng, Wei-Cheng and Diwan, Anuj and Shih, Yi-Jen and Shi, Jiatong and Chen, William and Chen, Xuanjun and Hsiao, Chi-Yuan and Peng, Puyuan and Wang, Shih-Heng and Kuan, Chun-Yi and Lu, Ke-Han and Chang, Kai-Wei and Yang, Chih-Kai and Gutierrez, Fabian Alejandro Ritter and Huang, Kuan-Po and Arora, Siddhant and Lin, You-Kuan and To, CHUANG Ming and Yeo, Eunjung and Chang, Kalvin and Chien, Chung-Ming and Choi, Kwanghee and Hsieh, Cheng-Hsiu and Lin, Yi-Cheng and Yu, Chee-En and Chiu, I-Hsiang and Guimar{\~a}es, Heitor and Han, Jionghao and Lin, Tzu-Quan and Lin, Tzu-Yuan and Chang, Homu and Chang, Ting-Wu and Chen, Chun Wei and Chen, Shou-Jen and Chen, Yu-Hua and Cheng, Hsi-Chun and Dhawan, Kunal and Fang, Jia-Lin and Fang, Shi-Xin and CHIANG, KUAN YU FANG and Fu, Chi An and Hsiao, Hsien-Fu and Hsu, Ching Yu and Huang, Shao-Syuan and Wei, Lee Chen and Lin, Hsi-Che and Lin, Hsuan-Hao and Lin, Hsuan-Ting and Lin, Jian-Ren and Liu, Ting-Chun and Lu, Li-Chun and Pai, Tsung-Min and Pasad, Ankita and Kuan, Shih-Yun Shan and Shon, Suwon and Tang, Yuxun and Tsai, Yun-Shao and Chiang, Wei Jui and Wei, Tzu-Chieh and Wu, Chengxi and Wu, Dien-Ruei and Yang, Chao-Han Huck and Yang, Chieh-Chi and Yip, Jia Qi and Yuan, Shao-Xiang and Wu, Haibin and Livescu, Karen and Harwath, David and Watanabe, Shinji and Lee, Hung-yi},
  year = {2025},
  booktitle = {ICLR 2025},
  month = apr,
  eprint = {2411.05361},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
}

2024

Interspeech 2024

Learning Fine-Grained Controllability on Speech Generation via Efficient Fine-Tuning

Chung-Ming Chien, Andros Tjandra, Apoorv Vyas, and 3 more authors

In Interspeech 2024

arXiv Bib Poster

@inproceedings{chien2024learning,
  author        = {Chien, Chung-Ming and Tjandra, Andros and Vyas, Apoorv and Le, Matt and Shi, Bowen and Hsu, Wei-Ning},
  title         = {Learning Fine-Grained Controllability on Speech Generation via Efficient Fine-Tuning},
  booktitle     = {Interspeech 2024},
  year          = {2024},
  month         = sep,
  eprint        = {2406.06251},
  archiveprefix = {arXiv},
  primaryclass  = {eess.AS},
}

ACL 2024

On the Evaluation of Speech Foundation Models for Spoken Language Understanding

Siddhant Arora, Ankita Pasad, Chung-Ming Chien, and 9 more authors

In Findings of the Association for Computational Linguistics: ACL 2024

arXiv Bib

@inproceedings{arora2024on,
  title = {On the Evaluation of Speech Foundation Models for Spoken Language Understanding},
  author = {Arora, Siddhant and Pasad, Ankita and Chien, Chung-Ming and Han, Jionghao and Sharma, Roshan and Jung, Jee-weon and Dhamyal, Hira and Chen, William and Shon, Suwon and Lee, Hung-yi and Livescu, Karen and Watanabe, Shinji},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  year = {2024},
  month = aug,
  eprint = {2406.10083},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
}

ICASSP 2024

AV2Wav: Diffusion-Based Re-synthesis from Continuous Self-supervised Features for Audio-Visual Speech Enhancement

Ju-Chieh Chou, Chung-Ming Chien, and Karen Livescu

In ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)

arXiv Bib

@inproceedings{chou2023av2wav,
  title = {{AV2Wav}: Diffusion-Based Re-synthesis from Continuous Self-supervised Features for Audio-Visual Speech Enhancement},
  author = {Chou, Ju-Chieh and Chien, Chung-Ming and Livescu, Karen},
  booktitle = {ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  year = {2024},
  month = apr,
  eprint = {2309.08030},
  archiveprefix = {arXiv},
  primaryclass = {eess.AS},
}

TACL 2024

What Do Self-Supervised Speech Models Know about Words?

Ankita Pasad, Chung-Ming Chien, Shane Settle, and 1 more author

Transactions of the Association for Computational Linguistics

arXiv Bib Code

@article{pasad2023what,
  title = {What Do Self-Supervised Speech Models Know about Words?},
  author = {Pasad, Ankita and Chien, Chung-Ming and Settle, Shane and Livescu, Karen},
  journal = {Transactions of the Association for Computational Linguistics},
  volume = {12},
  pages = {372--391},
  year = {2024},
  month = apr,
  issn = {2307-387X},
  doi = {10.1162/tacl_a_00656},
  eprint = {2307.00162},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
}

2023

ASRU 2023

Few-Shot Spoken Language Understanding via Joint Speech-Text Models

Chung-Ming Chien, Mingjiamei Zhang, Ju-Chieh Chou, and 1 more author

Best Student Paper Award

In 2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)

arXiv Bib Poster Slides Video

@inproceedings{chien2023few,
  author        = {Chien, Chung-Ming and Zhang, Mingjiamei and Chou, Ju-Chieh and Livescu, Karen},
  title         = {Few-Shot Spoken Language Understanding via Joint Speech-Text Models},
  booktitle     = {2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)},
  year          = {2023},
  month         = dec,
  eprint        = {2310.05919},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}

EMNLP 2023

Toward Joint Language Modeling for Speech Units and Text

Ju-Chieh Chou, Chung-Ming Chien, Wei-Ning Hsu, and 5 more authors

In Findings of the Association for Computational Linguistics: EMNLP 2023

arXiv Bib

@inproceedings{chou2023toward,
  title = {Toward Joint Language Modeling for Speech Units and Text},
  author = {Chou, Ju-Chieh and Chien, Chung-Ming and Hsu, Wei-Ning and Livescu, Karen and Babu, Arun and Conneau, Alexis and Baevski, Alexei and Auli, Michael},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023},
  year = {2023},
  pages = {6582--6593},
  doi = {10.18653/v1/2023.findings-emnlp.438},
  month = dec,
  url = {https://aclanthology.org/2023.findings-emnlp.438},
  eprint = {2310.08715},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
}

2022

ICASSP 2022

Voice Filter: Few-Shot Text-to-Speech Speaker Adaptation Using Voice Conversion as a Post-Processing Module

Adam Gabryś, Goeric Huybrechts, Manuel Sam Ribeiro, and 6 more authors

In ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)

arXiv Bib

@inproceedings{gabrys2022voice,
  author = {Gabry{\'s}, Adam and Huybrechts, Goeric and Ribeiro, Manuel Sam and Chien, Chung-Ming and Roth, Julian and Comini, Giulia and Barra-Chicote, Roberto and Perz, Bartek and Lorenzo-Trueba, Jaime},
  booktitle = {ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title = {Voice Filter: Few-Shot Text-to-Speech Speaker Adaptation Using Voice Conversion as a Post-Processing Module},
  year = {2022},
  pages = {7902--7906},
  doi = {10.1109/ICASSP43922.2022.9747239},
  month = may,
  eprint = {2202.08164},
  archiveprefix = {arXiv},
  primaryclass = {eess.AS},
}

2021

Interspeech 2021

S2VC: A Framework for Any-to-Any Voice Conversion with Self-Supervised Pretrained Representations

Jheng-hao Lin, Yist Y. Lin, Chung-Ming Chien, and 1 more author

In Proc. Interspeech 2021

arXiv Bib Code

@inproceedings{lin2021s2vc,
  author = {Lin, Jheng-hao and Lin, Yist Y. and Chien, Chung-Ming and Lee, Hung-yi},
  title = {{S2VC}: A Framework for Any-to-Any Voice Conversion with Self-Supervised Pretrained Representations},
  year = {2021},
  booktitle = {Proc. Interspeech 2021},
  pages = {836--840},
  doi = {10.21437/Interspeech.2021-1356},
  month = aug,
  eprint = {2104.02901},
  archiveprefix = {arXiv},
  primaryclass = {eess.AS},
}

ICASSP 2021

Investigating on Incorporating Pretrained and Learnable Speaker Representations for Multi-Speaker Multi-Style Text-to-Speech

Chung-Ming Chien, Jheng-Hao Lin, Chien-yu Huang, and 2 more authors

In ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)

arXiv Bib Code Poster Slides

@inproceedings{chien2021investigating,
  author = {Chien, Chung-Ming and Lin, Jheng-Hao and Huang, Chien-yu and Hsu, Po-chun and Lee, Hung-yi},
  booktitle = {ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title = {Investigating on Incorporating Pretrained and Learnable Speaker Representations for Multi-Speaker Multi-Style Text-to-Speech},
  year = {2021},
  pages = {8588--8592},
  doi = {10.1109/ICASSP39728.2021.9413880},
  month = jun,
  eprint = {2103.04088},
  archiveprefix = {arXiv},
  primaryclass = {eess.AS},
}

ICASSP 2021

FragmentVC: Any-To-Any Voice Conversion by End-To-End Extracting and Fusing Fine-Grained Voice Fragments with Attention

Chung-Ming Chien*, Yist Y. Lin*, Jheng-Hao Lin, and 2 more authors

*equal contribution

In ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)

arXiv Bib Code Poster Slides

@inproceedings{chien2020fragmentvc,
  author = {Chien*, Chung-Ming and Lin*, Yist Y. and Lin, Jheng-Hao and Lee, Hung-yi and Lee, Lin-shan},
  booktitle = {ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title = {{FragmentVC}: Any-To-Any Voice Conversion by End-To-End Extracting and Fusing Fine-Grained Voice Fragments with Attention},
  year = {2021},
  pages = {5939--5943},
  doi = {10.1109/ICASSP39728.2021.9413699},
  month = jun,
  eprint = {2010.14150},
  archiveprefix = {arXiv},
  primaryclass = {eess.AS},
}

SLT 2021

Hierarchical Prosody Modeling for Non-Autoregressive Speech Synthesis

Chung-Ming Chien and Hung-yi Lee

In 2021 IEEE Spoken Language Technology Workshop (SLT)

arXiv Bib Slides

@inproceedings{chien2020hierarchical,
  author = {Chien, Chung-Ming and Lee, Hung-yi},
  booktitle = {2021 IEEE Spoken Language Technology Workshop (SLT)},
  title = {Hierarchical Prosody Modeling for Non-Autoregressive Speech Synthesis},
  year = {2021},
  pages = {446--453},
  doi = {10.1109/SLT48900.2021.9383629},
  month = jan,
  eprint = {2011.06465},
  archiveprefix = {arXiv},
  primaryclass = {eess.AS},
}