<?xml version="1.0" encoding="utf-8" standalone="no"?>
<dublin_core schema="dc">
<dcvalue element="contributor" qualifier="author">Cho,&#x20;Sungjae</dcvalue>
<dcvalue element="contributor" qualifier="author">Lee,&#x20;Soo-Young</dcvalue>
<dcvalue element="date" qualifier="accessioned">2024-01-12T04:08:33Z</dcvalue>
<dcvalue element="date" qualifier="available">2024-01-12T04:08:33Z</dcvalue>
<dcvalue element="date" qualifier="created">2022-10-04</dcvalue>
<dcvalue element="date" qualifier="issued">2021</dcvalue>
<dcvalue element="identifier" qualifier="issn">2308-457X</dcvalue>
<dcvalue element="identifier" qualifier="uri">https:&#x2F;&#x2F;pubs.kist.re.kr&#x2F;handle&#x2F;201004&#x2F;77777</dcvalue>
<dcvalue element="description" qualifier="abstract">We&#x20;present&#x20;a&#x20;methodology&#x20;to&#x20;train&#x20;our&#x20;multi-speaker&#x20;emotional&#x20;text-to-speech&#x20;synthesizer&#x20;that&#x20;can&#x20;express&#x20;speech&#x20;for&#x20;10&#x20;speakers&apos;&#x20;7&#x20;different&#x20;emotions.&#x20;All&#x20;silences&#x20;from&#x20;audio&#x20;samples&#x20;are&#x20;removed&#x20;prior&#x20;to&#x20;learning.&#x20;This&#x20;results&#x20;in&#x20;fast&#x20;learning&#x20;by&#x20;our&#x20;model.&#x20;Curriculum&#x20;learning&#x20;is&#x20;applied&#x20;to&#x20;train&#x20;our&#x20;model&#x20;efficiently.&#x20;Our&#x20;model&#x20;is&#x20;first&#x20;trained&#x20;with&#x20;a&#x20;large&#x20;single-speaker&#x20;neutral&#x20;dataset,&#x20;and&#x20;then&#x20;trained&#x20;with&#x20;neutral&#x20;speech&#x20;from&#x20;all&#x20;speakers.&#x20;Finally,&#x20;our&#x20;model&#x20;is&#x20;trained&#x20;using&#x20;datasets&#x20;of&#x20;emotional&#x20;speech&#x20;from&#x20;all&#x20;speakers.&#x20;In&#x20;each&#x20;stage,&#x20;training&#x20;samples&#x20;of&#x20;each&#x20;speaker-emotion&#x20;pair&#x20;have&#x20;equal&#x20;probability&#x20;to&#x20;appear&#x20;in&#x20;mini-batches.&#x20;Through&#x20;this&#x20;procedure,&#x20;our&#x20;model&#x20;can&#x20;synthesize&#x20;speech&#x20;for&#x20;all&#x20;targeted&#x20;speakers&#x20;and&#x20;emotions.&#x20;Our&#x20;synthesized&#x20;audio&#x20;sets&#x20;are&#x20;available&#x20;on&#x20;our&#x20;web&#x20;page.</dcvalue>
<dcvalue element="language" qualifier="none">English</dcvalue>
<dcvalue element="publisher" qualifier="none">ISCA-INT&#x20;SPEECH&#x20;COMMUNICATION&#x20;ASSOC</dcvalue>
<dcvalue element="title" qualifier="none">Multi-speaker&#x20;Emotional&#x20;Text-to-speech&#x20;Synthesizer</dcvalue>
<dcvalue element="type" qualifier="none">Conference</dcvalue>
<dcvalue element="identifier" qualifier="doi">10.48550&#x2F;arXiv.2112.03557</dcvalue>
<dcvalue element="description" qualifier="journalClass">1</dcvalue>
<dcvalue element="identifier" qualifier="bibliographicCitation">Interspeech&#x20;Conference,&#x20;pp.2337&#x20;-&#x20;2338</dcvalue>
<dcvalue element="citation" qualifier="title">Interspeech&#x20;Conference</dcvalue>
<dcvalue element="citation" qualifier="startPage">2337</dcvalue>
<dcvalue element="citation" qualifier="endPage">2338</dcvalue>
<dcvalue element="citation" qualifier="conferencePlace">CZ</dcvalue>
<dcvalue element="citation" qualifier="conferencePlace">Brno,&#x20;CZECH&#x20;REPUBLIC</dcvalue>
<dcvalue element="citation" qualifier="conferenceDate">2021-08-30</dcvalue>
<dcvalue element="relation" qualifier="isPartOf">INTERSPEECH&#x20;2021</dcvalue>
<dcvalue element="identifier" qualifier="wosid">000841879502086</dcvalue>
</dublin_core>
