-
Notifications
You must be signed in to change notification settings - Fork 27
Expand file tree
/
Copy pathindex.html
More file actions
540 lines (490 loc) · 25 KB
/
Copy pathindex.html
File metadata and controls
540 lines (490 loc) · 25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Bulma CSS: presumably the source of the hero / is-size-* / button classes used in the page header, which the inline stylesheet below does not define. -->
<!-- NOTE(review): the package specifier in this URL reads "[email protected]", which looks like a mangled "bulma@VERSION". Confirm the stylesheet actually loads and restore the pinned version if it does not. -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css">
<!-- Font Awesome Icons: provides the "fab fa-github" glyph on the Code button. -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css">
<!-- AI Icon (for arXiv): provides the "ai ai-arxiv" glyph on the arXiv button. -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/tmnavidz/arxiv-icon@latest/css/ai.min.css">
<!-- Document encoding, mobile viewport, and the browser tab title. -->
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>UniTok-Audio: A Unified Audio Generation Framework</title>
<style>
/* Design tokens (colours, border, shadow) shared by the rules below. */
:root {
--primary-color: #2c3e50;
--secondary-color: #3498db;
/* --accent-color is declared but not referenced by any rule in this stylesheet. */
--accent-color: #e74c3c;
--bg-light: #f8f9fa;
--text-dark: #2c3e50;
--text-gray: #7f8c8d;
--border: #e0e0e0;
--shadow: 0 0 20px rgba(0, 0, 0, 0.1);
}
/* Page defaults: system sans-serif stack on a light grey canvas, no outer margin. */
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
line-height: 1.7;
color: var(--text-dark);
background-color: #f5f5f5;
margin: 0;
padding: 0;
}
/* White rounded card, capped at 1200px and centred horizontally.
   The markup applies this class to the outer page wrapper and again to the
   wrapper inside the hero header, so the card styling applies to both. */
.container {
max-width: 1200px;
margin: 20px auto;
padding: 30px;
background: white;
border-radius: 12px;
box-shadow: var(--shadow);
position: relative;
}
/* Title & Authors */
/* Page headline; the markup applies .title to the h1 element. */
.title {
text-align: center;
font-size: 2.4em;
font-weight: 700;
color: var(--primary-color);
margin-bottom: 10px;
letter-spacing: -0.5px;
}
/* NOTE(review): no element in this page's markup carries class "links"
   (the link row uses "publication-links"), so the three .links rules
   below currently match nothing. */
.links {
text-align: center;
margin: 15px 0;
font-size: 1.1em;
}
/* Pill-style link button. */
.links a {
margin: 0 16px;
padding: 8px 16px;
background: var(--secondary-color);
color: white !important;
text-decoration: none;
border-radius: 6px;
font-weight: 600;
transition: all 0.3s ease;
}
/* Hover state: darker fill and a 2px upward nudge. */
.links a:hover {
background: #2980b9;
transform: translateY(-2px);
}
/* NOTE(review): no "authors" wrapper exists in the markup either
   (the author rows use "publication-authors"). */
.authors p {
margin: 8px 0;
font-size: 1.1em;
font-weight: 500;
color: var(--text-dark);
}
/* NOTE(review): no element in the markup uses class "institution". */
.institution {
font-size: 1em;
color: var(--text-gray);
text-align: center;
margin: 10px 0 30px;
}
/* Section */
/* Vertical spacing after each content section. */
.section {
margin-bottom: 50px;
}
/* Section heading: inline-block, so the bottom border is only as wide as the heading text. */
.section-title {
font-size: 1.6em;
color: var(--primary-color);
margin-bottom: 25px;
padding-bottom: 10px;
border-bottom: 2px solid var(--border);
font-weight: 600;
display: inline-block;
}
/* Abstract body: tinted panel with a coloured bar along its left edge. */
.abstract-content {
padding: 25px;
background-color: var(--bg-light);
border-left: 4px solid var(--secondary-color);
border-radius: 8px;
font-size: 1.15em;
line-height: 1.8;
color: var(--text-dark);
}
/* Figures */
/* Stacks the image and its caption in a centred column. */
.figure-container {
display: flex;
flex-direction: column;
align-items: center;
gap: 15px;
margin: 20px 0;
}
/* Figure image: never wider than its container, aspect ratio preserved. */
.figure {
max-width: 100%;
height: auto;
border-radius: 8px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
transition: transform 0.3s ease;
}
/* Slight zoom on hover, animated by the transition declared on .figure. */
.figure:hover {
transform: scale(1.02);
}
/* Caption text shown under the image. */
.figure-caption {
font-size: 0.95em;
color: var(--text-gray);
text-align: center;
margin-top: 8px;
font-style: italic;
}
/* Audio Tables */
/* Scroll wrapper: a table wider than the viewport scrolls horizontally inside it. */
.audio-demos {
width: 100%;
overflow-x: auto;
margin-top: 15px;
}
/* Full-width demo table on a white background with a soft shadow. */
.audio-table {
width: 100%;
border-collapse: collapse;
background: white;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.08);
border-radius: 8px;
overflow: hidden;
}
/* Column header cells. */
.audio-table th {
background-color: #f1f3f5;
color: var(--primary-color);
font-weight: 600;
padding: 14px 12px;
text-align: center;
font-size: 1.1em;
border-bottom: 2px solid var(--border);
}
/* Body cells: centred content with a thin divider under each row. */
.audio-table td {
padding: 14px 12px;
text-align: center;
border-bottom: 1px solid var(--border);
vertical-align: middle;
}
/* Row highlight on hover. */
.audio-table tr:hover {
background-color: #f8f9ff;
}
/* Fixed-size player on wide screens; the max-width: 768px media query sets the width to 100%. */
audio {
width: 250px;
height: 40px;
border-radius: 8px;
box-shadow: 0 1px 5px rgba(0, 0, 0, 0.1);
}
/* NOTE(review): no cell in this page's tables uses class "audio-name". */
.audio-name {
font-weight: 600;
color: var(--text-dark);
text-align: left;
padding: 14px;
background: #fafafa;
}
/* Responsive */
/* Overrides for viewports up to 768px wide: tighter padding, smaller type, full-width players. */
@media (max-width: 768px) {
.container {
padding: 15px;
margin: 10px;
}
.title {
font-size: 2em;
}
/* NOTE(review): as with the base .links rules, nothing in the markup matches this selector. */
.links a {
display: block;
margin: 10px auto;
width: fit-content;
}
/* Players fill their table cell instead of the fixed 250px. */
audio {
width: 100%;
}
.audio-table {
font-size: 0.95em;
}
.abstract-content {
font-size: 1em;
padding: 18px;
}
}
</style>
</head>
<body>
<div class="container">
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop has-text-centered">
<!-- Title -->
<h1 class="title is-1 publication-title">
UniTok-Audio: A Unified Audio Generation Framework Via Universal Discrete Token
</h1>
<!-- Authors: one .author-block span per author, comma-separated.
     † marks equal contribution and * the corresponding author (see the note below the institutions);
     the superscript numbers index the institution list that follows.
     Every span is opened and closed on its own so the author blocks stay siblings. -->
<div class="is-size-5 publication-authors">
  <span class="author-block">
    <a href="#">Chengwei Liu†</a><sup>1</sup>,</span>
  <span class="author-block">
    <a href="#">Haoyin Yan†</a><sup>1,2</sup>,</span>
  <span class="author-block">
    <a href="#">Shaofei Xue*</a><sup>1,2</sup>,</span>
  <span class="author-block">
    <a href="#">Xiaotao Liang</a><sup>1</sup>,</span>
  <span class="author-block">
    <a href="#">Yinghao Liu</a><sup>1,2</sup>,</span>
  <span class="author-block">
    <a href="#">Zheng Xue</a><sup>1</sup>,</span>
  <span class="author-block">
    <a href="#">Boyang Zhou</a><sup>1,3</sup></span>
</div>
<!-- Institutions: numbered to match the superscripts in the author list; comma-separated like the authors. -->
<div class="is-size-5 publication-authors">
  <span class="author-block"><sup>1</sup>Intelligent Connectivity, Alibaba Group</span>,
  <span class="author-block"><sup>2</sup>Tongyi Lab, Alibaba Group</span>,
  <span class="author-block"><sup>3</sup>Zhejiang University</span>
</div>
<!-- Corresponding Note -->
<div class="is-size-6" style="margin-top: 6px;">
<em>†Equal Contribution: Haoyin Yan, Chengwei Liu</em>,
<em>*Corresponding author: Shaofei Xue</em>
</div>
<!-- Links: external resources for the paper (arXiv preprint and source code).
     The icon glyphs are decorative, so they are hidden from assistive technology;
     the adjacent text span gives each link its accessible name. -->
<div class="publication-links" style="margin-top: 18px;">
  <span class="link-block">
    <a href="https://arxiv.org/abs/2510.26372"
       class="external-link button is-normal is-rounded is-dark">
      <span class="icon">
        <i class="ai ai-arxiv" aria-hidden="true"></i>
      </span>
      <span>arXiv</span>
    </a>
  </span>
  <span class="link-block">
    <!-- Opens in a new tab; rel="noopener noreferrer" keeps the opened page from reaching window.opener. -->
    <a href="https://github.com/alibaba/unified-audio"
       class="external-link button is-normal is-rounded is-dark" target="_blank" rel="noopener noreferrer">
      <span class="icon">
        <i class="fab fa-github" aria-hidden="true"></i>
      </span>
      <span>Code</span>
    </a>
  </span>
</div>
</div>
</div>
</section>
<!-- Abstract -->
<div class="section">
<h2 class="section-title">Abstract</h2>
<div class="abstract-content">
Generative modeling has recently achieved remarkable success across text, image, and audio domains, demonstrating powerful capabilities for unified representation learning. However, audio generation models still face challenges in terms of audio quality and generalization ability across tasks. To address these issues, we propose UniTok-Audio, a scalable and extensible framework for unified audio generation tasks. Specifically, 1) UniTok-Audio extracts continuous features of conditions to generate discrete tokens of target audio in an autoregressive manner; 2) a special task identifier token unifies different learning patterns of multiple tasks in a unified framework; 3) a dual-stream audio codec involving acoustic and semantic branches is developed for high-fidelity waveform reconstruction. Experimental results demonstrate that UniTok-Audio achieves competitive performance in comparison with state-of-the-art task-specific or multi-task systems across five time-aligned tasks: speech restoration, target speaker extraction, speech separation, voice conversion, and language-queried audio source separation. To foster future research, we will open-source our codebase.
</div>
</div>
<!-- UniTok-Audio Architecture -->
<div class="section">
<h2 class="section-title">UniTok-Audio</h2>
<div class="figure-container">
<img class="figure" src="./UniTok-audio/Figure/UniTok_audio_05.png" alt="UniTok-Audio Architecture">
<p class="figure-caption">Fig. 1: Overall architecture of UniTok-Audio, where the H-Codec Encoder is only used to generate label tokens during training and excluded during inference.</p>
</div>
<p style="margin-top: 20px; text-align: justify;">
UniTok-Audio is a unified discrete-token-based audio generation framework that consists of four parts: (1) a WavLM or HuBERT audio encoder, (2) a CLAP text encoder, (3) a LLaMA-based language model, and (4) a novel H-Codec decoder. The WavLM and HuBERT encoders extract continuous speech features from audio. The LLaMA-based LM takes these features as input and predicts discrete speech tokens generated by H-Codec in an autoregressive manner. Finally, the H-Codec decoder reconstructs enhanced speech from predicted tokens.
Guided by task identifiers and timestep embeddings, the LM processes latent sequences using cross-attention to generate task-specific outputs.
</p>
</div>
<!-- H-Codec -->
<div class="section">
<h2 class="section-title">H-Codec</h2>
<div class="figure-container">
<img class="figure" src="./UniTok-audio/Figure/H-CODEC_ARC.jpg" alt="H-Codec Architecture">
<p class="figure-caption">Fig. 2: Overall architecture of H-Codec.</p>
</div>
<p style="margin-top: 20px; text-align: justify;">
H-Codec features two encoding streams: a self-supervised learning (SSL) stream and a waveform stream. The SSL stream captures semantic-rich information and injects it into the first-layer codec tokens via direct encoding from HuBERT/WavLM features. The waveform stream uses a proven DAC-like framework to encode and decode high-quality audio. Both streams are downsampled to achieve a low frame rate of 25 Hz.
During training, both streams are used to obtain target tokens. During inference, only the decoder is active to generate high-fidelity audio.
</p>
</div>
<div class="section">
<h2 class="section-title">H-Codec Result</h2>
<!-- Fig. 3: H-Codec evaluation results. The alt text describes this image, not the architecture diagram of Fig. 2. -->
<div class="figure-container">
  <img class="figure" src="./UniTok-audio/Figure/H-Codec_result.png" alt="H-Codec speech reconstruction and semantic performance results">
  <p class="figure-caption">Fig. 3: Speech Reconstruction and Semantic Performance.</p>
</div>
<p style="margin-top: 20px; text-align: justify;">
H-Codec achieves the best performance at a token rate of 50 for most metrics. Moreover, its UTMOS score closely matches that of the ground truth, indicating that the reconstructed audio faithfully preserves the original speech quality. We also observe that certain models exceed the ground truth in UTMOS when operating at low token rates. We suspect this occurs because, under limited token constraints, the decoder behaves partly as a generative model—yielding plausible speech output whose alignment with the input is less precise.
</p>
</div>
<!-- Speech Restoration (SR) -->
<div class="section">
<h2 class="section-title">Speech Restoration (SR)</h2>
<div class="audio-demos">
<!-- Speech restoration samples: each row holds the clean, noisy and enhanced
     versions of the same numbered sample from AudioSamples/SR. -->
<table class="audio-table">
  <thead>
    <tr>
      <th scope="col">Clean</th>
      <th scope="col">Noisy</th>
      <th scope="col">Enhanced</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SR/clean/1.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SR/noisy/1.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SR/enhanced/1.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SR/clean/2.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SR/noisy/2.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SR/enhanced/2.wav" type="audio/wav"></audio></td>
    </tr>
    <!-- Samples 3 and 4: the clean and noisy files are FLAC, so their type hint is audio/flac;
         the enhanced outputs are WAV. -->
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SR/clean/3.flac" type="audio/flac"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SR/noisy/3.flac" type="audio/flac"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SR/enhanced/3.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SR/clean/4.flac" type="audio/flac"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SR/noisy/4.flac" type="audio/flac"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SR/enhanced/4.wav" type="audio/wav"></audio></td>
    </tr>
  </tbody>
</table>
</div>
</div>
<!-- Target Speech Extraction (TSE) -->
<div class="section">
<h2 class="section-title">Target Speech Extraction (TSE)</h2>
<div class="audio-demos">
<!-- Target speech extraction samples: each row holds the target (clean), mixture (noisy)
     and enhanced versions of the same numbered sample from AudioSamples/TSE.
     scope="col" ties every header cell to its column for assistive technology. -->
<table class="audio-table">
  <thead>
    <tr>
      <th scope="col">Target</th>
      <th scope="col">Mixture</th>
      <th scope="col">Enhanced</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/TSE/clean/1.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/TSE/noisy/1.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/TSE/enhanced/1.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/TSE/clean/2.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/TSE/noisy/2.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/TSE/enhanced/2.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/TSE/clean/3.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/TSE/noisy/3.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/TSE/enhanced/3.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/TSE/clean/4.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/TSE/noisy/4.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/TSE/enhanced/4.wav" type="audio/wav"></audio></td>
    </tr>
  </tbody>
</table>
</div>
</div>
<!-- Speech Separation (SS) -->
<div class="section">
<h2 class="section-title">Speech Separation (SS)</h2>
<div class="audio-demos">
<!-- Speech separation samples: each row holds the mixture (noisy) and the two
     separated outputs (s1, s2) of the same numbered sample from AudioSamples/SS.
     scope="col" ties every header cell to its column for assistive technology. -->
<table class="audio-table">
  <thead>
    <tr>
      <th scope="col">Mixture</th>
      <th scope="col">Speaker 1</th>
      <th scope="col">Speaker 2</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SS/noisy/1.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SS/s1/1.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SS/s2/1.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SS/noisy/2.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SS/s1/2.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SS/s2/2.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SS/noisy/3.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SS/s1/3.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SS/s2/3.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SS/noisy/4.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SS/s1/4.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/SS/s2/4.wav" type="audio/wav"></audio></td>
    </tr>
  </tbody>
</table>
</div>
</div>
<!-- Voice Conversion (VC) -->
<div class="section">
<h2 class="section-title">Voice Conversion (VC)</h2>
<div class="audio-demos">
<!-- Voice conversion samples: each row holds the source, reference and converted
     versions of the same numbered sample from AudioSamples/VC.
     scope="col" ties every header cell to its column for assistive technology. -->
<table class="audio-table">
  <thead>
    <tr>
      <th scope="col">Source</th>
      <th scope="col">Reference</th>
      <th scope="col">Converted</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/VC/source/1.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/VC/ref/1.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/VC/converted/1.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/VC/source/2.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/VC/ref/2.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/VC/converted/2.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/VC/source/3.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/VC/ref/3.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/VC/converted/3.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/VC/source/4.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/VC/ref/4.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/VC/converted/4.wav" type="audio/wav"></audio></td>
    </tr>
  </tbody>
</table>
</div>
</div>
<!-- Language-Queried Audio Source Separation (LASS) -->
<div class="section">
<h2 class="section-title">Language-Queried Audio Source Separation (LASS)</h2>
<div class="audio-demos">
<!-- Language-queried separation samples: each row pairs a text query with a source
     recording and the corresponding output file from AudioSamples/LASS.
     scope="col" ties every header cell to its column for assistive technology. -->
<table class="audio-table">
  <thead>
    <tr>
      <th scope="col">Text Query</th>
      <th scope="col">Source</th>
      <th scope="col">Converted</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>the crowd is cheering and giving applause</td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/LASS/source/test-real-case-16.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/LASS/convert/test-real-case-16_convert.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td>someone is beating the drum continuously</td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/LASS/source/test-real-case-68.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/LASS/convert/test-real-case-68_convert.wav" type="audio/wav"></audio></td>
    </tr>
    <tr>
      <td>Someone is typing on a keyboard</td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/LASS/source/SeparatebyText_Y0z97A0GYhAQ_YFSBDt4ENLqE.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/LASS/convert/SeparatebyText_Y0z97A0GYhAQ_YFSBDt4ENLqE_convert.wav" type="audio/wav"></audio></td>
    </tr>
    <!-- NOTE(review): this row points at exactly the same source and converted files as the
         previous row although its text query differs. Confirm the intended sample paths;
         this looks like a copy-paste leftover. -->
    <tr>
      <td>a person is pressing the shutter button of the camera to check something</td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/LASS/source/SeparatebyText_Y0z97A0GYhAQ_YFSBDt4ENLqE.wav" type="audio/wav"></audio></td>
      <td><audio controls><source src="./UniTok-audio/AudioSamples/LASS/convert/SeparatebyText_Y0z97A0GYhAQ_YFSBDt4ENLqE_convert.wav" type="audio/wav"></audio></td>
    </tr>
  </tbody>
</table>
</div>
</div>
</div>
</body>
</html>