<!DOCTYPE html>
<html>
<head>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-C1CRWDNJ1J"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-C1CRWDNJ1J');
</script>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>HF. 21 papers. March 4.</title>
<link rel="icon" href="favicon.svg" sizes="any" type="image/svg+xml">
<link href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;700&display=swap" rel="stylesheet">
<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@100..900&family=Tiny5&display=swap" rel="stylesheet">
<style>
:root {
--primary-color: cornflowerblue;
--primary-color-dark: #fffd87cf;
--secondary-color: #fff;
--background-color: #eee;
--text-color: #333333;
--header-color: cornflowerblue;
--body-color: #eee;
--menu-color: #002370;
}
.background-digit {
position: absolute;
font-family: 'Tiny5';
bottom: -20px;
right: -10px;
font-size: 8em;
font-weight: 400;
color: #0989ea22;
z-index: 2;
line-height: 1;
}
.dark-theme .background-digit {
color: #e9e78f3d;
}
body {
font-family: 'Roboto Slab', sans-serif;
line-height: 1.6;
color: var(--text-color);
margin: 0;
padding: 0;
min-height: 100vh;
display: flex;
flex-direction: column;
}
.container {
max-width: 1500px;
margin: 0 auto;
flex: 1 0 auto;
width: 100%
}
.a-clean {
color: var(--secondary-color);
text-decoration: none;
}
.a-clean:hover {
color: #fff;
}
header {
padding: 3.6em 0 2.4em 0;
text-align: center;
}
footer {
background-color: var(--primary-color);
color: white;
text-align: center;
margin-top: 2em;
flex-shrink: 0;
padding: 20px;
}
h1 {
font-size: 2.4em;
margin: 0;
font-weight: 700;
}
.article-title-cont {
margin: -21px -21px 0px -21px;
padding: 10px 20px;
background: cornflowerblue;
display: table;
min-height: 5.9em;
}
.dark-theme .article-title-cont {
background: #444444;
}
.article-title {
color: white;
}
.article-title h2 {
margin: 0px;
padding: 0px;
font-weight: 400;
text-align:center;
}
h2 {
/* color: var(--primary-color); */
font-size: 1.2em;
margin-top: 0;
margin-bottom: 0.5em;
}
header p {
font-size: 1.2em;
margin-top: 0.5em;
font-weight: 300;
}
main {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
gap: 1.5em;
padding: 10px 20px 20px 20px;
}
body.dark-theme>header {
background-color: #333333;
color: white;
}
body.dark-theme>div>main>article>div.article-content>p.meta {
color: #fff;
}
body.light-theme>div>main>article>div.article-content>p.meta {
color: #555;
}
body.dark-theme>div>main>article>div.article-content>p.pub-date {
color: #ccc;
}
body.light-theme>div>main>article>div.article-content>p.pub-date {
color: #555;
}
body.dark-theme>div>main>article>div.article-content>div.tags {
color: #ccc;
}
body.light-theme>div>main>article>div.article-content>div.tags {
color: #fff;
}
body.light-theme>header {
background-color: var(--header-color);
color: white;
}
article {
display: flex;
flex-direction: row;
justify-content: center;
}
.article-content {
border-radius: 5px;
border: 1px solid #ddd;
overflow: hidden;
transition: background-color 0.2s ease;
padding: 1.3em;
flex-grow: 1;
display: flex;
flex-direction: column;
position: relative;
z-index: 1;
cursor: pointer;
max-width: 800px;
}
body.dark-theme>div>main>article>div.article-content {
background-color: #444;
border: none;
}
body.light-theme>div>main>article>div.article-content {
background-color: #fff;
}
body.dark-theme>div>main>article>div.article-content:hover {
background-color: #414141;
}
body.light-theme>div>main>article>div.article-content:hover {
background-color: #fafafa;
}
.meta {
font-size: 0.9em;
margin-bottom: 0em;
font-weight: 500;
margin: 20px 0 0px 0;
padding-bottom: 20px;
border-bottom: 1px solid #ddd;
}
.pub-date {
font-size: 0.8em;
margin-bottom: 0.8em;
font-weight: 400;
text-align: right;
font-family: Roboto;
}
.tags {
font-size: 0.9em;
margin-bottom: 0;
position: absolute;
bottom: 0px;
font-weight: 300;
font-family: 'Roboto Slab';
background: #555;
left: 0;
width: 100%;
padding: 10px 20px;
}
.abstract {
position: relative;
max-height: 170px;
overflow: hidden;
transition: max-height 0.3s ease;
cursor: pointer;
}
.abstract.expanded {
max-height: 1000px;
}
.abstract-toggle {
position: absolute;
bottom: 4px;
right: 0;
cursor: pointer;
color: var(--primary-color);
float: right;
font-weight: 400;
}
.explanation {
background-color: #e8f5e9;
border-left: 4px solid var(--secondary-color);
padding: 1em;
margin-top: 1.5em;
}
.links {
margin-top: 1.5em;
margin-bottom: 20px;
}
.affiliations {
margin-bottom: 50px;
padding:10px;
font-size: 0.9em;
text-align: center
}
a {
color: var(--primary-color);
text-decoration: none;
font-weight: 500;
transition: color 0.3s ease;
}
.dark-theme a {
color: var(--primary-color-dark);
}
a:hover {
color: #e73838;
}
.light-theme {
background-color: var(--body-color);
color: #333333;
}
.dark-theme {
background-color: #333333;
color: #ffffff;
}
.theme-switch {
position: absolute;
top: 20px;
right: 20px;
display: flex;
align-items: center;
}
.switch {
position: relative;
display: inline-block;
width: 50px;
height: 30px;
}
.switch input {
opacity: 0;
width: 0;
height: 0;
}
.slider {
position: absolute;
cursor: pointer;
top: 0;
left: 0;
right: 0;
bottom: 0;
background-color: #ccc;
transition: .4s;
border-radius: 30px;
}
.slider:before {
position: absolute;
content: "";
height: 24px;
width: 24px;
left: 3px;
bottom: 3px;
background-color: white;
transition: .4s;
border-radius: 50%;
}
input:checked + .slider {
background-color: var(--primary-color);
}
input:checked + .slider:before {
transform: translateX(20px);
}
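/* Theme switch, a minimal note: the visually hidden checkbox above drives the toggle.
   .slider draws the track, .slider:before draws the knob, and the :checked sibling
   rules recolor the track with --primary-color and slide the knob 20px to the right. */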
.switch-label {
margin-right: 10px;
}
.sub-header-container {
display: flex;
justify-content: space-between;
align-items: center;
flex-wrap: wrap;
gap: 15px;
margin-top: 7px;
padding: 0 20px;
}
.sub-header-container-2 {
display: flex;
justify-content: left;
align-items: center;
flex-wrap: wrap;
gap: 15px;
margin: 0 auto;
padding: 0 20px;
}
.update-info-container {
margin-top: 15px;
margin-bottom: 0px;
text-align: left;
flex: 1;
}
.sort-container {
margin-top: 15px;
margin-bottom: 0px;
text-align: right;
flex: 2;
}
.category-toggle-container {
display: inline-block;
margin-top: 15px;
margin-bottom: 10px;
cursor: pointer;
}
.category-option-container {
margin-top: 15px;
margin-bottom: 10px;
display: none;
margin-left: auto;
}
.category-option-container.expanded {
display: block;
}
.sort-dropdown {
padding: 5px 10px;
font-size: 16px;
border-radius: 5px;
border: 1px solid #ccc;
background-color: white;
color: var(--text-color);
font-family: 'Roboto Slab', sans-serif;
}
.sort-label {
margin-right: 10px;
font-size: 1.0em !important;
}
.dark-theme .sort-dropdown {
background-color: #444;
color: white;
border-color: var(--text-color);
}
.title-sign {
display: inline-block;
transition: all 0.5s ease;
}
.rotate {
transform: rotate(45deg) translateY(-6px);
transform-origin: center;
}
.title-text {
display: inline;
padding-left: 10px;
}
.summary_title {
font-size: 1.2em;
font-weight: bold;
color: #222;
margin-bottom: 5px;
}
.summary_text {
}
.summary_image {
max-height: 500px;
max-width: 100%;
display: block;
margin-left: auto;
margin-right: auto;
margin-top: 40px;
margin-bottom: 60px;
}
.category-filters {
margin-top: 20px;
margin-bottom: 20px;
text-align: center;
display: none;
}
.category-filters.expanded {
display: block;
margin-top: 10px;
}
.category-button {
display: inline-block;
margin: 5px;
padding: 5px 10px;
border-radius: 15px;
background-color: #f0f0f0;
color: #333;
cursor: pointer;
transition: background-color 0.3s ease;
}
.category-button.active {
background-color: var(--primary-color);
color: white;
}
.category-button.inactive:not(.active) {
color: #ccc;
}
.dark-theme .category-button {
background-color: #555;
color: #fff;
}
.dark-theme .category-button.active {
background-color: var(--primary-color);
}
.dark-theme .category-button.inactive:not(.active) {
color: #888;
}
.clear-categories {
display: inline-block;
margin: 5px;
padding: 5px 10px;
border-radius: 15px;
background-color: #f0f0f0;
color: #333;
cursor: pointer;
transition: background-color 0.3s ease;
}
.clear-categories:hover {
background-color: #bbb;
}
.svg-container {
display: inline-block;
position: relative;
overflow: hidden;
}
.svg-container span {
position: relative;
z-index: 1;
}
.svg-container svg {
position: absolute;
bottom: 0;
left: 0;
z-index: 0;
}
.nav-menu {
background-color: var(--menu-color);
padding: 2px 0 2px 0;
display: inline-block;
position: relative;
overflow: hidden;
width: 100%;
}
.nav-container {
max-width: 1500px;
margin: 0 auto;
display: flex;
justify-content: left;
gap: 3em;
}
.nav-container span a {
color: white;
}
.nav-item {
color: white;
padding: 3px 0px;
cursor: pointer;
font-weight: 400;
}
.nav-prev {
margin-left: 20px;
}
.nav-item:hover {
background-color: rgba(255, 255, 255, 0.1);
border-color: rgba(255, 255, 255, 0.3);
}
.language-flags {
display: flex;
gap: 7px;
padding: 5px 20px 0 0;
margin-left: auto;
}
.flag-svg {
width: 22px;
height: 22px;
cursor: pointer;
opacity: 0.4;
transition: opacity 0.3s ease;
border-radius: 2px;
}
.flag-svg.active {
opacity: 1;
}
.flag-svg:hover {
opacity: 0.8;
}
.dark-theme .nav-menu {
background-color: #333;
}
.dark-theme .nav-item {
color: white;
}
.dark-theme .nav-item:hover {
background-color: rgba(255, 255, 255, 0.05);
}
.pointer { cursor: pointer; }
.article-pdf-title-img {
max-width: 100%;
max-height: 400px;
display: inline-block;
margin-top: 10px;
margin-bottom: 10px;
border-radius: 5px;
}
.article-pdf-title-img-cont {
text-align: center;
}
.dark-theme .article-pdf-title-img {
opacity: 0.8;
filter: grayscale(1);
}
@media (max-width: 600px) {
.nav-container {
flex-direction: row;
gap: 1.5em;
}
.nav-item {
padding: 3px 0px;
}
}
@media (max-width: 768px) {
.category-filters {
display: none;
}
.category-toggle {
display: inline-block;
width: 100%;
text-align: left;
}
.category-filters.expanded {
display: block;
margin-top: 10px;
}
}
@media (max-width: 600px) {
.sub-header-container {
flex-direction: column;
align-items: flex-start;
}
.sort-container {
width: 100%;
display: flex;
justify-content: left;
margin: 0 auto;
}
.sort-dropdown {
margin-left: auto;
}
.sort-label {
margin-top: 5px;
float: left;
}
.sub-header-container-2 {
flex-direction: row;
align-items: flex-start;
}
.update-info-container {
text-align: left;
width: 100%;
margin-bottom: 0px;
}
.category-toggle-container {
margin-top: 15px;
text-align: left;
margin-bottom: 10px;
}
.category-option-container {
margin-top: 15px;
text-align: center;
margin-bottom: 10px;
}
main {
grid-template-columns: 1fr;
gap: 0em;
padding: 10px 0 20px 0;
}
footer {
margin-top: -20px;
}
article>div.article-content {
border-radius: 0px;
}
}
</style>
<script>
function toggleAbstract(id) {
var abstract = document.getElementById('abstract-' + id);
var toggle = document.getElementById('toggle-' + id);
if (abstract.classList.contains('expanded')) {
abstract.classList.remove('expanded');
toggle.textContent = '...';
} else {
abstract.classList.add('expanded');
toggle.textContent = '';
}
}
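// Usage sketch (illustrative markup, not part of this file): given
//   <div id="abstract-7" class="abstract">...</div> and <span id="toggle-7">...</span>,
// calling toggleAbstract(7) expands the abstract and clears the "..." marker;
// calling it again collapses it and restores the marker.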
function getTimeDiff(dateString, lang='ru') {
const timeUnits = {
ru: {
minute: ["минуту", "минуты", "минут"],
hour: ["час", "часа", "часов"],
day: ["день", "дня", "дней"],
justNow: "только что",
ago: "назад"
},
en: {
minute: ["minute", "minutes", "minutes"],
hour: ["hour", "hours", "hours"],
day: ["day", "days", "days"],
justNow: "just now",
ago: "ago"
},
zh: {
minute: ["分钟", "分钟", "分钟"],
hour: ["小时", "小时", "小时"],
day: ["天", "天", "天"],
justNow: "刚刚",
ago: "前"
}
};
function getPlural(number, words, lang) {
if (lang === 'ru') {
if (number % 10 === 1 && number % 100 !== 11) {
return words[0];
} else if (number % 10 >= 2 && number % 10 <= 4 && (number % 100 < 10 || number % 100 >= 20)) {
return words[1];
} else {
return words[2];
}
} else if (lang === 'en') {
return number === 1 ? words[0] : words[1];
} else {
// Chinese doesn't need plural forms
return words[0];
}
}
function formatTimeDiff(number, unit, lang) {
const unitWord = getPlural(number, timeUnits[lang][unit], lang);
if (lang === 'zh') {
return `${number}${unitWord}${timeUnits[lang].ago}`;
} else {
return `${number} ${unitWord} ${timeUnits[lang].ago}`;
}
}
if (!['ru', 'en', 'zh'].includes(lang)) {
throw new Error('Unsupported language. Supported languages are: ru, en, zh');
}
const pastDate = new Date(dateString.replace(" ", "T") + ":00Z");
const currentDate = new Date();
const diffInSeconds = Math.floor((currentDate - pastDate) / 1000);
const minutes = Math.floor(diffInSeconds / 60);
const hours = Math.floor(diffInSeconds / 3600);
const days = Math.floor(diffInSeconds / 86400);
if (minutes === 0) {
return timeUnits[lang].justNow;
} else if (minutes < 60) {
return formatTimeDiff(minutes, 'minute', lang);
} else if (hours < 24) {
return formatTimeDiff(hours, 'hour', lang);
} else {
return formatTimeDiff(days, 'day', lang);
}
}
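// Usage sketch (illustrative; getTimeDiff expects "YYYY-MM-DD HH:MM" and parses it as UTC):
//   getTimeDiff('2025-03-04 10:00', 'en')  // e.g. "3 hours ago" when run at 13:05 UTC
//   getTimeDiff('2025-03-04 10:00', 'ru')  // "3 часа назад" for the same moment
//   getTimeDiff('2025-03-04 10:00', 'zh')  // "3小时前"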
function isToday(dateString) {
const inputDate = new Date(dateString);
const today = new Date();
return (
inputDate.getFullYear() === today.getFullYear() &&
inputDate.getMonth() === today.getMonth() &&
inputDate.getDate() === today.getDate()
);
}
function isCurrentMonth(dateString) {
const inputDate = new Date(dateString);
const today = new Date();
return (
inputDate.getFullYear() === today.getFullYear() &&
inputDate.getMonth() === today.getMonth()
);
}
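// Note: isToday() and isCurrentMonth() compare the given date against the visitor's
// local clock (local Date getters), so near midnight their result can differ from the
// UTC-based getTimeDiff() above.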
function formatArticlesTitle(number, lang='ru') {
const lastDigit = number % 10;
const lastTwoDigits = number % 100;
let word;
if (!['ru', 'en', 'zh'].includes(lang)) {
throw new Error('Unsupported language. Supported languages are: ru, en, zh');
}
if (lang === 'ru') {
if (lastTwoDigits >= 11 && lastTwoDigits <= 14) {
word = "статей";
} else if (lastDigit === 1) {
word = "статья";
} else if (lastDigit >= 2 && lastDigit <= 4) {
word = "статьи";
} else {
word = "статей";
}
} else if (lang === 'en') {
if (number === 1) {
word = 'paper'
} else {
word = 'papers'
}
} else if (lang === 'zh') {
word = "篇论文"
}
if (lang === 'zh') {
return `${number}${word}`;
} else {
return `${number} ${word}`;
}
}
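// Usage sketch (illustrative):
//   formatArticlesTitle(21, 'ru')  // "21 статья"  (last digit 1, and 21 is not 11-14)
//   formatArticlesTitle(21, 'en')  // "21 papers"
//   formatArticlesTitle(21, 'zh')  // "21篇论文"
//   formatArticlesTitle(1, 'en')   // "1 paper"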
</script>
</head>
<body class="light-theme">
<header>
<div class="container">
<a href="https://hfday.ru" class="a-clean"><h1 class="title-sign" id="doomgrad-icon">🔺</h1><h1 class="title-text" id="doomgrad">hf daily</h1></a>
<p><span id="title-date">4 марта</span> | <span id="title-articles-count">21 papers</span></p>
</div>
<div class="theme-switch">
<label class="switch">
<input type="checkbox" id="theme-toggle">
<span class="slider"></span>
</label>
</div>
</header>
<div class="nav-menu">
<div class="nav-container">
<span class="nav-item nav-prev" id="nav-prev"><a href="/d/2025-03-03.html">⬅️ <span id="prev-date">03.03</span></a></span>
<span class="nav-item" id="nav-next"><a href="/d/2025-03-05.html">➡️ <span id="next-date">05.03</span></a></span>
<span class="nav-item" id="nav-monthly"><a href="/m/2025-03.html">📈 <span id='top-month-label'>Месяц</span></a></span>
<div class="language-flags">
<svg class="flag-svg" data-lang="ru" xmlns="http://www.w3.org/2000/svg" width="32" height="32" viewBox="0 0 32 32"><path fill="#1435a1" d="M1 11H31V21H1z"></path><path d="M5,4H27c2.208,0,4,1.792,4,4v4H1v-4c0-2.208,1.792-4,4-4Z" fill="#fff"></path><path d="M5,20H27c2.208,0,4,1.792,4,4v4H1v-4c0-2.208,1.792-4,4-4Z" transform="rotate(180 16 24)" fill="#c53a28"></path><path d="M27,4H5c-2.209,0-4,1.791-4,4V24c0,2.209,1.791,4,4,4H27c2.209,0,4-1.791,4-4V8c0-2.209-1.791-4-4-4Zm3,20c0,1.654-1.346,3-3,3H5c-1.654,0-3-1.346-3-3V8c0-1.654,1.346-3,3-3H27c1.654,0,3,1.346,3,3V24Z" opacity=".15"></path><path d="M27,5H5c-1.657,0-3,1.343-3,3v1c0-1.657,1.343-3,3-3H27c1.657,0,3,1.343,3,3v-1c0-1.657-1.343-3-3-3Z" fill="#fff" opacity=".2"></path></svg>
<svg class="flag-svg" data-lang="zh" xmlns="http://www.w3.org/2000/svg" width="32" height="32" viewBox="0 0 32 32"><rect x="1" y="4" width="30" height="24" rx="4" ry="4" fill="#db362f"></rect><path d="M27,4H5c-2.209,0-4,1.791-4,4V24c0,2.209,1.791,4,4,4H27c2.209,0,4-1.791,4-4V8c0-2.209-1.791-4-4-4Zm3,20c0,1.654-1.346,3-3,3H5c-1.654,0-3-1.346-3-3V8c0-1.654,1.346-3,3-3H27c1.654,0,3,1.346,3,3V24Z" opacity=".15"></path><path fill="#ff0" d="M7.958 10.152L7.19 7.786 6.421 10.152 3.934 10.152 5.946 11.614 5.177 13.979 7.19 12.517 9.202 13.979 8.433 11.614 10.446 10.152 7.958 10.152z"></path><path fill="#ff0" d="M12.725 8.187L13.152 8.898 13.224 8.072 14.032 7.886 13.269 7.562 13.342 6.736 12.798 7.361 12.035 7.037 12.461 7.748 11.917 8.373 12.725 8.187z"></path><path fill="#ff0" d="M14.865 10.372L14.982 11.193 15.37 10.46 16.187 10.602 15.61 10.007 15.997 9.274 15.253 9.639 14.675 9.044 14.793 9.865 14.048 10.23 14.865 10.372z"></path><path fill="#ff0" d="M15.597 13.612L16.25 13.101 15.421 13.13 15.137 12.352 14.909 13.149 14.081 13.179 14.769 13.642 14.541 14.439 15.194 13.928 15.881 14.391 15.597 13.612z"></path><path fill="#ff0" d="M13.26 15.535L13.298 14.707 12.78 15.354 12.005 15.062 12.46 15.754 11.942 16.402 12.742 16.182 13.198 16.875 13.236 16.047 14.036 15.827 13.26 15.535z"></path><path d="M27,5H5c-1.657,0-3,1.343-3,3v1c0-1.657,1.343-3,3-3H27c1.657,0,3,1.343,3,3v-1c0-1.657-1.343-3-3-3Z" fill="#fff" opacity=".2"></path></svg>
<svg class="flag-svg" data-lang="en" xmlns="http://www.w3.org/2000/svg" width="32" height="32" viewBox="0 0 32 32"><rect x="1" y="4" width="30" height="24" rx="4" ry="4" fill="#fff"></rect><path d="M1.638,5.846H30.362c-.711-1.108-1.947-1.846-3.362-1.846H5c-1.414,0-2.65,.738-3.362,1.846Z" fill="#a62842"></path><path d="M2.03,7.692c-.008,.103-.03,.202-.03,.308v1.539H31v-1.539c0-.105-.022-.204-.03-.308H2.03Z" fill="#a62842"></path><path fill="#a62842" d="M2 11.385H31V13.231H2z"></path><path fill="#a62842" d="M2 15.077H31V16.923000000000002H2z"></path><path fill="#a62842" d="M1 18.769H31V20.615H1z"></path><path d="M1,24c0,.105,.023,.204,.031,.308H30.969c.008-.103,.031-.202,.031-.308v-1.539H1v1.539Z" fill="#a62842"></path><path d="M30.362,26.154H1.638c.711,1.108,1.947,1.846,3.362,1.846H27c1.414,0,2.65-.738,3.362-1.846Z" fill="#a62842"></path><path d="M5,4h11v12.923H1V8c0-2.208,1.792-4,4-4Z" fill="#102d5e"></path><path d="M27,4H5c-2.209,0-4,1.791-4,4V24c0,2.209,1.791,4,4,4H27c2.209,0,4-1.791,4-4V8c0-2.209-1.791-4-4-4Zm3,20c0,1.654-1.346,3-3,3H5c-1.654,0-3-1.346-3-3V8c0-1.654,1.346-3,3-3H27c1.654,0,3,1.346,3,3V24Z" opacity=".15"></path><path d="M27,5H5c-1.657,0-3,1.343-3,3v1c0-1.657,1.343-3,3-3H27c1.657,0,3,1.343,3,3v-1c0-1.657-1.343-3-3-3Z" fill="#fff" opacity=".2"></path><path fill="#fff" d="M4.601 7.463L5.193 7.033 4.462 7.033 4.236 6.338 4.01 7.033 3.279 7.033 3.87 7.463 3.644 8.158 4.236 7.729 4.827 8.158 4.601 7.463z"></path><path fill="#fff" d="M7.58 7.463L8.172 7.033 7.441 7.033 7.215 6.338 6.989 7.033 6.258 7.033 6.849 7.463 6.623 8.158 7.215 7.729 7.806 8.158 7.58 7.463z"></path><path fill="#fff" d="M10.56 7.463L11.151 7.033 10.42 7.033 10.194 6.338 9.968 7.033 9.237 7.033 9.828 7.463 9.603 8.158 10.194 7.729 10.785 8.158 10.56 7.463z"></path><path fill="#fff" d="M6.066 9.283L6.658 8.854 5.927 8.854 5.701 8.158 5.475 8.854 4.744 8.854 5.335 9.283 5.109 9.979 5.701 9.549 6.292 9.979 6.066 9.283z"></path><path fill="#fff" d="M9.046 9.283L9.637 8.854 8.906 8.854 8.68 8.158 8.454 8.854 7.723 8.854 8.314 9.283 8.089 9.979 8.68 9.549 9.271 9.979 9.046 9.283z"></path><path fill="#fff" d="M12.025 9.283L12.616 8.854 11.885 8.854 11.659 8.158 11.433 8.854 10.702 8.854 11.294 9.283 11.068 9.979 11.659 9.549 12.251 9.979 12.025 9.283z"></path><path fill="#fff" d="M6.066 12.924L6.658 12.494 5.927 12.494 5.701 11.799 5.475 12.494 4.744 12.494 5.335 12.924 5.109 13.619 5.701 13.19 6.292 13.619 6.066 12.924z"></path><path fill="#fff" d="M9.046 12.924L9.637 12.494 8.906 12.494 8.68 11.799 8.454 12.494 7.723 12.494 8.314 12.924 8.089 13.619 8.68 13.19 9.271 13.619 9.046 12.924z"></path><path fill="#fff" d="M12.025 12.924L12.616 12.494 11.885 12.494 11.659 11.799 11.433 12.494 10.702 12.494 11.294 12.924 11.068 13.619 11.659 13.19 12.251 13.619 12.025 12.924z"></path><path fill="#fff" d="M13.539 7.463L14.13 7.033 13.399 7.033 13.173 6.338 12.947 7.033 12.216 7.033 12.808 7.463 12.582 8.158 13.173 7.729 13.765 8.158 13.539 7.463z"></path><path fill="#fff" d="M4.601 11.104L5.193 10.674 4.462 10.674 4.236 9.979 4.01 10.674 3.279 10.674 3.87 11.104 3.644 11.799 4.236 11.369 4.827 11.799 4.601 11.104z"></path><path fill="#fff" d="M7.58 11.104L8.172 10.674 7.441 10.674 7.215 9.979 6.989 10.674 6.258 10.674 6.849 11.104 6.623 11.799 7.215 11.369 7.806 11.799 7.58 11.104z"></path><path fill="#fff" d="M10.56 11.104L11.151 10.674 10.42 10.674 10.194 9.979 9.968 10.674 9.237 10.674 9.828 11.104 9.603 11.799 10.194 11.369 10.785 11.799 10.56 11.104z"></path><path fill="#fff" d="M13.539 11.104L14.13 10.674 13.399 
10.674 13.173 9.979 12.947 10.674 12.216 10.674 12.808 11.104 12.582 11.799 13.173 11.369 13.765 11.799 13.539 11.104z"></path><path fill="#fff" d="M4.601 14.744L5.193 14.315 4.462 14.315 4.236 13.619 4.01 14.315 3.279 14.315 3.87 14.744 3.644 15.44 4.236 15.01 4.827 15.44 4.601 14.744z"></path><path fill="#fff" d="M7.58 14.744L8.172 14.315 7.441 14.315 7.215 13.619 6.989 14.315 6.258 14.315 6.849 14.744 6.623 15.44 7.215 15.01 7.806 15.44 7.58 14.744z"></path><path fill="#fff" d="M10.56 14.744L11.151 14.315 10.42 14.315 10.194 13.619 9.968 14.315 9.237 14.315 9.828 14.744 9.603 15.44 10.194 15.01 10.785 15.44 10.56 14.744z"></path><path fill="#fff" d="M13.539 14.744L14.13 14.315 13.399 14.315 13.173 13.619 12.947 14.315 12.216 14.315 12.808 14.744 12.582 15.44 13.173 15.01 13.765 15.44 13.539 14.744z"></path></svg>
</div>
</div>
</div>
<div class="container">
<div class="sub-header-container">
<div class="update-info-container">
<label class="update-info-label" id="timeDiff"></label>
</div>
<div class="sort-container">
<label class="sort-label">🔀 <span id="sort-label-text">Сортировка по</span></label>
<select id="sort-dropdown" class="sort-dropdown">
<option value="default">рейтингу</option>
<option value="pub_date">дате публикации</option>
<option value="issue_id">добавлению на HF</option>
</select>
</div>
</div>
<div class="sub-header-container-2">
<div class="category-toggle-container">
<div class="svg-container">
<span id="category-toggle">🏷️ Фильтр</span>
<svg height="3" width="200">
<line x1="0" y1="0" x2="200" y2="0"
stroke="black"
stroke-width="2"
stroke-dasharray="3, 3" />
</svg>
</div>
</div>
<div class="category-option-container" id="category-options">
<label class="pointer" for="filter-logic-or"><input type="radio" id="filter-logic-or" name="filter-logic" value="or"> A∪B</label>
<label class="pointer" for="filter-logic-and"><input type="radio" id="filter-logic-and" name="filter-logic" value="and"> A∩B</label>
</div>
</div>
<div class="category-filters" id="category-filters">
<span class="clear-categories" id="clear-categories">🧹</span>
<!-- Categories -->
</div>
<main id="articles-container">
<!-- Articles -->
</main>
</div>
<footer>
<div class="container">
<p><a style="color:white;" href="https://t.me/doomgrad">doomgrad</a> ✖️ <a style="color:white;" href="https://huggingface.co/papers">hugging face</a></p>
</div>
</footer>
<script>
// Language handling
let currentLang = localStorage.getItem('selectedLang') || 'en';
let feedDate = {'ru': '4 марта', 'en': 'March 4', 'zh': '3月4日'};
let feedDateNext = {'ru': '05.03', 'en': '03/05', 'zh': '3月5日'};
let feedDatePrev = {'ru': '03.03', 'en': '03/03', 'zh': '3月3日'};
let filterLabel = {'ru': 'Фильтр', 'en': 'Topics', 'zh': '主题筛选'}
let publishedLabel = {'ru': 'статья от ', 'en': 'published on ', 'zh': '发表于'}
let sortLabel = {'ru': 'Сортировка по', 'en': 'Sort by', 'zh': '排序方式'}
let paperLabel = {'ru': 'Статья', 'en': 'Paper', 'zh': '论文'}
let topMonthLabel = {'ru': 'Месяц', 'en': 'Month', 'zh': '月度论文'}
let topDayLabel = {'ru': 'День', 'en': 'Day', 'zh': '日度论文'}
function initializeLanguageFlags() {
const flags = document.querySelectorAll('.flag-svg');
flags.forEach(flag => {
if (flag.dataset.lang === currentLang) {
flag.classList.add('active');
}
flag.addEventListener('click', () => {
flags.forEach(f => f.classList.remove('active'));
flag.classList.add('active');
currentLang = flag.dataset.lang;
localStorage.setItem('selectedLang', currentLang);
updateTimeDiffs();
updateLocalization();
filterAndRenderArticles();
});
});
}
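// Note: the chosen language is persisted in localStorage under 'selectedLang';
// clicking a flag re-runs updateTimeDiffs(), updateLocalization() and
// filterAndRenderArticles() so the page re-renders in that language.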
function toggleTheme() {
const body = document.body;
body.classList.toggle('light-theme');
body.classList.toggle('dark-theme');
const isDarkMode = body.classList.contains('dark-theme');
localStorage.setItem('darkMode', isDarkMode);
if (isDarkMode) {
const title = document.getElementById('doomgrad');
title.innerHTML = "hf nightly";
const titleSign = document.getElementById('doomgrad-icon');
titleSign.classList.add('rotate');
} else {
const title = document.getElementById('doomgrad');
title.innerHTML = "hf daily";
const titleSign = document.getElementById('doomgrad-icon');
titleSign.classList.remove('rotate');
}
}
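// Note: toggleTheme() persists the choice in localStorage under 'darkMode' and swaps
// the header wordmark between "hf daily" and "hf nightly" (rotating the 🔺 icon).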
const articlesData = [{'id': 'https://huggingface.co/papers/2503.01785', 'title': 'Visual-RFT: Visual Reinforcement Fine-Tuning', 'url': 'https://huggingface.co/papers/2503.01785', 'abstract': "Reinforcement Fine-Tuning (RFT) in Large Reasoning Models like OpenAI o1 learns from feedback on its answers, which is especially useful in applications when fine-tuning data is scarce. Recent open-source work like DeepSeek-R1 demonstrates that reinforcement learning with verifiable reward is one key direction in reproducing o1. While the R1-style model has demonstrated success in language models, its application in multi-modal domains remains under-explored. This work introduces Visual Reinforcement Fine-Tuning (Visual-RFT), which further extends the application areas of RFT on visual tasks. Specifically, Visual-RFT first uses Large Vision-Language Models (LVLMs) to generate multiple responses containing reasoning tokens and final answers for each input, and then uses our proposed visual perception verifiable reward functions to update the model via the policy optimization algorithm such as Group Relative Policy Optimization (GRPO). We design different verifiable reward functions for different perception tasks, such as the Intersection over Union (IoU) reward for object detection. Experimental results on fine-grained image classification, few-shot object detection, reasoning grounding, as well as open-vocabulary object detection benchmarks show the competitive performance and advanced generalization ability of Visual-RFT compared with Supervised Fine-tuning (SFT). For example, Visual-RFT improves accuracy by 24.3% over the baseline in one-shot fine-grained image classification with around 100 samples. In few-shot object detection, Visual-RFT also exceeds the baseline by 21.9 on COCO's two-shot setting and 15.4 on LVIS. Our Visual-RFT represents a paradigm shift in fine-tuning LVLMs, offering a data-efficient, reward-driven approach that enhances reasoning and adaptability for domain-specific tasks.", 'score': 33, 'issue_id': 2511, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': 'ef2e10eb59ab7743', 'authors': ['Ziyu Liu', 'Zeyi Sun', 'Yuhang Zang', 'Xiaoyi Dong', 'Yuhang Cao', 'Haodong Duan', 'Dahua Lin', 'Jiaqi Wang'], 'affiliations': ['Shanghai Artificial Intelligence Laboratory', 'Shanghai Jiaotong University', 'The Chinese University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2503.01785.jpg', 'data': {'categories': ['#multimodal', '#open_source', '#cv', '#optimization', '#rlhf', '#reasoning', '#training', '#rl'], 'emoji': '🔬', 'ru': {'title': 'Visual-RFT: Революция в тонкой настройке визуально-языковых моделей', 'desc': 'Статья представляет Visual Reinforcement Fine-Tuning (Visual-RFT) - метод, расширяющий применение обучения с подкреплением в визуальных задачах. Visual-RFT использует большие визуально-языковые модели для генерации ответов с токенами рассуждений и применяет визуально верифицируемые функции вознаграждения для обновления модели. Эксперименты показывают превосходство Visual-RFT над методом Supervised Fine-tuning в задачах классификации изображений, обнаружения объектов и обоснованного заземления. 
Метод демонстрирует значительное улучшение точности и обобщающей способности при ограниченном количестве обучающих примеров.'}, 'en': {'title': 'Revolutionizing Visual Learning with Reinforcement Fine-Tuning', 'desc': "This paper introduces Visual Reinforcement Fine-Tuning (Visual-RFT), a method that enhances large vision-language models (LVLMs) by using reinforcement learning to improve their performance on visual tasks. Visual-RFT generates multiple responses for each input and employs verifiable reward functions to optimize the model's policy, making it particularly effective in scenarios with limited fine-tuning data. The approach demonstrates significant improvements in tasks like fine-grained image classification and object detection, outperforming traditional supervised fine-tuning methods. Overall, Visual-RFT represents a novel, efficient way to fine-tune LVLMs, focusing on reasoning and adaptability in specific domains."}, 'zh': {'title': '视觉强化微调:提升推理与适应性的创新方法', 'desc': '强化微调(RFT)在大型推理模型中通过反馈学习,特别适用于微调数据稀缺的应用场景。本文提出的视觉强化微调(Visual-RFT)扩展了RFT在视觉任务中的应用,利用大型视觉语言模型生成多种响应,并通过可验证的视觉感知奖励函数进行模型更新。实验结果表明,Visual-RFT在细粒度图像分类和少样本目标检测等任务中表现出色,相较于传统的监督微调(SFT)方法,准确率显著提高。Visual-RFT代表了一种新的微调范式,提供了一种数据高效、以奖励驱动的方法,增强了模型在特定领域任务中的推理能力和适应性。'}}}, {'id': 'https://huggingface.co/papers/2503.01774', 'title': 'Difix3D+: Improving 3D Reconstructions with Single-Step Diffusion Models', 'url': 'https://huggingface.co/papers/2503.01774', 'abstract': 'Neural Radiance Fields and 3D Gaussian Splatting have revolutionized 3D reconstruction and novel-view synthesis task. However, achieving photorealistic rendering from extreme novel viewpoints remains challenging, as artifacts persist across representations. In this work, we introduce Difix3D+, a novel pipeline designed to enhance 3D reconstruction and novel-view synthesis through single-step diffusion models. At the core of our approach is Difix, a single-step image diffusion model trained to enhance and remove artifacts in rendered novel views caused by underconstrained regions of the 3D representation. Difix serves two critical roles in our pipeline. First, it is used during the reconstruction phase to clean up pseudo-training views that are rendered from the reconstruction and then distilled back into 3D. This greatly enhances underconstrained regions and improves the overall 3D representation quality. More importantly, Difix also acts as a neural enhancer during inference, effectively removing residual artifacts arising from imperfect 3D supervision and the limited capacity of current reconstruction models. Difix3D+ is a general solution, a single model compatible with both NeRF and 3DGS representations, and it achieves an average 2times improvement in FID score over baselines while maintaining 3D consistency.', 'score': 26, 'issue_id': 2512, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': '39af2f882aef9afb', 'authors': ['Jay Zhangjie Wu', 'Yuxuan Zhang', 'Haithem Turki', 'Xuanchi Ren', 'Jun Gao', 'Mike Zheng Shou', 'Sanja Fidler', 'Zan Gojcic', 'Huan Ling'], 'affiliations': ['NVIDIA', 'National University of Singapore', 'University of Toronto', 'Vector Institute'], 'pdf_title_img': 'assets/pdf/title_img/2503.01774.jpg', 'data': {'categories': ['#3d', '#diffusion'], 'emoji': '🖼️', 'ru': {'title': 'Одношаговая диффузия для фотореалистичной 3D-реконструкции', 'desc': 'Difix3D+ - это новый подход к улучшению 3D-реконструкции и синтеза изображений с новых ракурсов. 
В его основе лежит Difix - одношаговая модель диффузии изображений, обученная улучшать и устранять артефакты в визуализированных видах. Difix используется как на этапе реконструкции для очистки псевдо-обучающих видов, так и во время вывода для устранения остаточных артефактов. Difix3D+ совместим с представлениями NeRF и 3DGS и показывает двукратное улучшение оценки FID по сравнению с базовыми моделями.'}, 'en': {'title': 'Enhancing 3D Reconstruction with Difix3D+', 'desc': 'This paper presents Difix3D+, a new method for improving 3D reconstruction and novel-view synthesis using single-step diffusion models. The core component, Difix, is an image diffusion model that enhances rendered views by removing artifacts caused by underconstrained areas in 3D representations. It plays a dual role by cleaning up pseudo-training views during reconstruction and acting as a neural enhancer during inference to eliminate residual artifacts. Difix3D+ is versatile, working with both Neural Radiance Fields (NeRF) and 3D Gaussian Splatting (3DGS), and it significantly improves the quality of 3D representations, achieving a 2x better FID score compared to existing methods.'}, 'zh': {'title': 'Difix3D+: 提升3D重建与新视角合成的利器', 'desc': 'Neural Radiance Fields(NeRF)和3D高斯点云(3D Gaussian Splatting)在3D重建和新视角合成任务中取得了重大进展。然而,从极端新视角实现真实感渲染仍然面临挑战,因为在表示中存在伪影。我们提出了Difix3D+,这是一种新颖的管道,旨在通过单步扩散模型增强3D重建和新视角合成。Difix作为核心模型,能够在重建阶段清理伪训练视图,并在推理阶段去除残留伪影,从而显著提高3D表示的质量。'}}}, {'id': 'https://huggingface.co/papers/2503.01743', 'title': 'Phi-4-Mini Technical Report: Compact yet Powerful Multimodal Language Models via Mixture-of-LoRAs', 'url': 'https://huggingface.co/papers/2503.01743', 'abstract': 'We introduce Phi-4-Mini and Phi-4-Multimodal, compact yet highly capable language and multimodal models. Phi-4-Mini is a 3.8-billion-parameter language model trained on high-quality web and synthetic data, significantly outperforming recent open-source models of similar size and matching the performance of models twice its size on math and coding tasks requiring complex reasoning. This achievement is driven by a carefully curated synthetic data recipe emphasizing high-quality math and coding datasets. Compared to its predecessor, Phi-3.5-Mini, Phi-4-Mini features an expanded vocabulary size of 200K tokens to better support multilingual applications, as well as group query attention for more efficient long-sequence generation. Phi-4-Multimodal is a multimodal model that integrates text, vision, and speech/audio input modalities into a single model. Its novel modality extension approach leverages LoRA adapters and modality-specific routers to allow multiple inference modes combining various modalities without interference. For example, it now ranks first in the OpenASR leaderboard to date, although the LoRA component of the speech/audio modality has just 460 million parameters. Phi-4-Multimodal supports scenarios involving (vision + language), (vision + speech), and (speech/audio) inputs, outperforming larger vision-language and speech-language models on a wide range of tasks. Additionally, we experiment to further train Phi-4-Mini to enhance its reasoning capabilities. 
Despite its compact 3.8-billion-parameter size, this experimental version achieves reasoning performance on par with or surpassing significantly larger models, including DeepSeek-R1-Distill-Qwen-7B and DeepSeek-R1-Distill-Llama-8B.', 'score': 23, 'issue_id': 2511, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': 'fb054d6547a4a4fb', 'authors': ['Abdelrahman Abouelenin', 'Atabak Ashfaq', 'Adam Atkinson', 'Hany Awadalla', 'Nguyen Bach', 'Jianmin Bao', 'Alon Benhaim', 'Martin Cai', 'Vishrav Chaudhary', 'Congcong Chen', 'Dong Chen', 'Dongdong Chen', 'Junkun Chen', 'Weizhu Chen', 'Yen-Chun Chen', 'Yi-ling Chen', 'Qi Dai', 'Xiyang Dai', 'Ruchao Fan', 'Mei Gao', 'Min Gao', 'Amit Garg', 'Abhishek Goswami', 'Junheng Hao', 'Amr Hendy', 'Yuxuan Hu', 'Xin Jin', 'Mahmoud Khademi', 'Dongwoo Kim', 'Young Jin Kim', 'Gina Lee', 'Jinyu Li', 'Yunsheng Li', 'Chen Liang', 'Xihui Lin', 'Zeqi Lin', 'Mengchen Liu', 'Yang Liu', 'Gilsinia Lopez', 'Chong Luo', 'Piyush Madan', 'Vadim Mazalov', 'Ali Mousavi', 'Anh Nguyen', 'Jing Pan', 'Daniel Perez-Becker', 'Jacob Platin', 'Thomas Portet', 'Kai Qiu', 'Bo Ren', 'Liliang Ren', 'Sambuddha Roy', 'Ning Shang', 'Yelong Shen', 'Saksham Singhal', 'Subhojit Som', 'Xia Song', 'Tetyana Sych', 'Praneetha Vaddamanu', 'Shuohang Wang', 'Yiming Wang', 'Zhenghao Wang', 'Haibin Wu', 'Haoran Xu', 'Weijian Xu', 'Yifan Yang', 'Ziyi Yang', 'Donghan Yu', 'Ishmam Zabir', 'Jianwen Zhang', 'Li Lyna Zhang', 'Yunan Zhang', 'Xiren Zhou'], 'affiliations': ['Microsoft'], 'pdf_title_img': 'assets/pdf/title_img/2503.01743.jpg', 'data': {'categories': ['#multimodal', '#small_models', '#data', '#agi', '#synthetic', '#long_context', '#optimization', '#dataset', '#training'], 'emoji': '🧠', 'ru': {'title': 'Компактные модели с большими возможностями: прорыв в эффективности ИИ', 'desc': 'Представлены две новые модели: Phi-4-Mini и Phi-4-Multimodal. Phi-4-Mini - это языковая модель с 3,8 миллиардами параметров, обученная на высококачественных веб-данных и синтетических данных, которая превосходит аналогичные модели в задачах математики и программирования. Phi-4-Multimodal - это мультимодальная модель, объединяющая текст, изображения и речь/аудио в единую систему с использованием LoRA-адаптеров. Обе модели демонстрируют высокую эффективность несмотря на свой компактный размер, превосходя более крупные аналоги в различных задачах.'}, 'en': {'title': 'Compact Models, Superior Performance!', 'desc': 'The paper presents Phi-4-Mini and Phi-4-Multimodal, two advanced models designed for language and multimodal tasks. Phi-4-Mini, with 3.8 billion parameters, excels in math and coding tasks by utilizing a high-quality synthetic data approach and an expanded vocabulary of 200K tokens. Phi-4-Multimodal integrates text, vision, and audio inputs, employing innovative techniques like LoRA adapters for efficient multi-modal processing. 
Both models demonstrate superior performance compared to larger counterparts, showcasing their effectiveness in complex reasoning and diverse input scenarios.'}, 'zh': {'title': '紧凑强大的多模态模型Phi-4系列', 'desc': '我们介绍了Phi-4-Mini和Phi-4-Multimodal这两种紧凑而强大的语言和多模态模型。Phi-4-Mini是一个拥有38亿参数的语言模型,经过高质量的网络和合成数据训练,在数学和编码任务中表现优于同类开源模型,并且在复杂推理方面与两倍于其规模的模型相当。相比于前身Phi-3.5-Mini,Phi-4-Mini扩展了词汇量,支持多语言应用,并采用了组查询注意力机制以提高长序列生成的效率。Phi-4-Multimodal则是一个多模态模型,能够将文本、视觉和语音/音频输入整合到一个模型中,支持多种推理模式,且在多个任务上超越了更大的视觉-语言和语音-语言模型。'}}}, {'id': 'https://huggingface.co/papers/2503.01496', 'title': 'Liger: Linearizing Large Language Models to Gated Recurrent Structures', 'url': 'https://huggingface.co/papers/2503.01496', 'abstract': 'Transformers with linear recurrent modeling offer linear-time training and constant-memory inference. Despite their demonstrated efficiency and performance, pretraining such non-standard architectures from scratch remains costly and risky. The linearization of large language models (LLMs) transforms pretrained standard models into linear recurrent structures, enabling more efficient deployment. However, current linearization methods typically introduce additional feature map modules that require extensive fine-tuning and overlook the gating mechanisms used in state-of-the-art linear recurrent models. To address these issues, this paper presents Liger, short for Linearizing LLMs to gated recurrent structures. Liger is a novel approach for converting pretrained LLMs into gated linear recurrent models without adding extra parameters. It repurposes the pretrained key matrix weights to construct diverse gating mechanisms, facilitating the formation of various gated recurrent structures while avoiding the need to train additional components from scratch. Using lightweight fine-tuning with Low-Rank Adaptation (LoRA), Liger restores the performance of the linearized gated recurrent models to match that of the original LLMs. Additionally, we introduce Liger Attention, an intra-layer hybrid attention mechanism, which significantly recovers 93\\% of the Transformer-based LLM at 0.02\\% pre-training tokens during the linearization process, achieving competitive results across multiple benchmarks, as validated on models ranging from 1B to 8B parameters. Code is available at https://github.com/OpenSparseLLMs/Linearization.', 'score': 12, 'issue_id': 2514, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': 'd5ca7ef45c0e90c9', 'authors': ['Disen Lan', 'Weigao Sun', 'Jiaxi Hu', 'Jusen Du', 'Yu Cheng'], 'affiliations': ['Nanjing University', 'Shanghai AI Laboratory', 'South China University of Technology', 'The Chinese University of Hong Kong', 'The Hong Kong University of Science and Technology (Guangzhou)'], 'pdf_title_img': 'assets/pdf/title_img/2503.01496.jpg', 'data': {'categories': ['#architecture', '#training', '#optimization', '#benchmark'], 'emoji': '🔢', 'ru': {'title': 'Эффективная линеаризация больших языковых моделей', 'desc': 'Данная статья представляет новый метод Liger для линеаризации больших языковых моделей (LLM) в гейтированные линейно-рекуррентные структуры. Liger преобразует предобученные LLM без добавления дополнительных параметров, используя существующие веса ключевой матрицы для создания различных механизмов гейтирования. Метод применяет легковесную донастройку с помощью Low-Rank Adaptation (LoRA) для восстановления производительности линеаризованных моделей. 
Авторы также представляют Liger Attention - гибридный механизм внимания, который значительно улучшает эффективность линеаризации.'}, 'en': {'title': 'Liger: Efficiently Transforming LLMs into Gated Linear Recurrent Models', 'desc': "This paper introduces Liger, a method for transforming pretrained large language models (LLMs) into gated linear recurrent models. Liger efficiently repurposes existing key matrix weights to create diverse gating mechanisms without adding extra parameters, thus avoiding the costly process of training new components from scratch. The approach employs lightweight fine-tuning techniques, specifically Low-Rank Adaptation (LoRA), to maintain the performance of the linearized models comparable to the original LLMs. Additionally, Liger incorporates a novel intra-layer hybrid attention mechanism, Liger Attention, which enhances the model's efficiency while achieving competitive results across various benchmarks."}, 'zh': {'title': 'Liger:高效转换预训练模型的创新方法', 'desc': '本文提出了一种名为Liger的方法,用于将预训练的大型语言模型(LLMs)转换为带门控的线性递归模型,而无需增加额外的参数。Liger通过重新利用预训练的关键矩阵权重,构建多样的门控机制,从而形成不同的门控递归结构。该方法使用轻量级的微调技术(如低秩适应LoRA),使线性化的门控递归模型的性能恢复到与原始LLMs相当的水平。此外,Liger Attention作为一种层内混合注意力机制,在线性化过程中显著恢复了93%的Transformer基础LLM的性能。'}}}, {'id': 'https://huggingface.co/papers/2502.18965', 'title': 'OneRec: Unifying Retrieve and Rank with Generative Recommender and Iterative Preference Alignment', 'url': 'https://huggingface.co/papers/2502.18965', 'abstract': "Recently, generative retrieval-based recommendation systems have emerged as a promising paradigm. However, most modern recommender systems adopt a retrieve-and-rank strategy, where the generative model functions only as a selector during the retrieval stage. In this paper, we propose OneRec, which replaces the cascaded learning framework with a unified generative model. To the best of our knowledge, this is the first end-to-end generative model that significantly surpasses current complex and well-designed recommender systems in real-world scenarios. Specifically, OneRec includes: 1) an encoder-decoder structure, which encodes the user's historical behavior sequences and gradually decodes the videos that the user may be interested in. We adopt sparse Mixture-of-Experts (MoE) to scale model capacity without proportionally increasing computational FLOPs. 2) a session-wise generation approach. In contrast to traditional next-item prediction, we propose a session-wise generation, which is more elegant and contextually coherent than point-by-point generation that relies on hand-crafted rules to properly combine the generated results. 3) an Iterative Preference Alignment module combined with Direct Preference Optimization (DPO) to enhance the quality of the generated results. Unlike DPO in NLP, a recommendation system typically has only one opportunity to display results for each user's browsing request, making it impossible to obtain positive and negative samples simultaneously. To address this limitation, We design a reward model to simulate user generation and customize the sampling strategy. Extensive experiments have demonstrated that a limited number of DPO samples can align user interest preferences and significantly improve the quality of generated results. 
We deployed OneRec in the main scene of Kuaishou, achieving a 1.6\\% increase in watch-time, which is a substantial improvement.", 'score': 9, 'issue_id': 2515, 'pub_date': '2025-02-26', 'pub_date_card': {'ru': '26 февраля', 'en': 'February 26', 'zh': '2月26日'}, 'hash': '21c5c80a138c98a0', 'authors': ['Jiaxin Deng', 'Shiyao Wang', 'Kuo Cai', 'Lejian Ren', 'Qigen Hu', 'Weifeng Ding', 'Qiang Luo', 'Guorui Zhou'], 'affiliations': ['KuaiShou Inc. Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2502.18965.jpg', 'data': {'categories': ['#alignment', '#rlhf', '#rag', '#games', '#training', '#optimization'], 'emoji': '🎥', 'ru': {'title': 'OneRec: Единая генеративная модель для революции в рекомендательных системах', 'desc': 'OneRec - это новая система рекомендаций, использующая единую генеративную модель вместо каскадного подхода. Она включает в себя структуру кодировщик-декодировщик с разреженной смесью экспертов (MoE) для масштабирования возможностей модели. OneRec применяет поэтапную генерацию сессий и модуль итеративного выравнивания предпочтений с прямой оптимизацией предпочтений (DPO). Система показала значительное улучшение времени просмотра при развертывании на платформе Kuaishou.'}, 'en': {'title': 'OneRec: Revolutionizing Recommendations with Generative Models', 'desc': 'This paper introduces OneRec, a novel generative retrieval-based recommendation system that improves upon traditional retrieve-and-rank methods. Unlike existing systems that use generative models merely for selection, OneRec employs a unified generative model that encodes user behavior and generates personalized video recommendations in a session-wise manner. The model utilizes a sparse Mixture-of-Experts architecture to enhance capacity while maintaining efficiency, and incorporates an Iterative Preference Alignment module to optimize user preferences effectively. Experimental results show that OneRec significantly outperforms existing systems, leading to a notable increase in user engagement metrics such as watch-time.'}, 'zh': {'title': 'OneRec:统一生成模型的推荐新范式', 'desc': '最近,基于生成检索的推荐系统成为一种有前景的范式。本文提出的OneRec模型,采用统一的生成模型,取代了传统的级联学习框架,能够在真实场景中显著超越现有复杂的推荐系统。OneRec包括编码-解码结构,能够有效编码用户的历史行为,并生成用户可能感兴趣的视频。此外,OneRec还引入了会话生成方法和迭代偏好对齐模块,提升了生成结果的质量,并在快手的实际应用中实现了观看时间的显著增加。'}}}, {'id': 'https://huggingface.co/papers/2503.01307', 'title': 'Cognitive Behaviors that Enable Self-Improving Reasoners, or, Four Habits of Highly Effective STaRs', 'url': 'https://huggingface.co/papers/2503.01307', 'abstract': "Test-time inference has emerged as a powerful paradigm for enabling language models to ``think'' longer and more carefully about complex challenges, much like skilled human experts. While reinforcement learning (RL) can drive self-improvement in language models on verifiable tasks, some models exhibit substantial gains while others quickly plateau. For instance, we find that Qwen-2.5-3B far exceeds Llama-3.2-3B under identical RL training for the game of Countdown. This discrepancy raises a critical question: what intrinsic properties enable effective self-improvement? We introduce a framework to investigate this question by analyzing four key cognitive behaviors -- verification, backtracking, subgoal setting, and backward chaining -- that both expert human problem solvers and successful language models employ. Our study reveals that Qwen naturally exhibits these reasoning behaviors, whereas Llama initially lacks them. 
In systematic experimentation with controlled behavioral datasets, we find that priming Llama with examples containing these reasoning behaviors enables substantial improvements during RL, matching or exceeding Qwen's performance. Importantly, the presence of reasoning behaviors, rather than correctness of answers, proves to be the critical factor -- models primed with incorrect solutions containing proper reasoning patterns achieve comparable performance to those trained on correct solutions. Finally, leveraging continued pretraining with OpenWebMath data, filtered to amplify reasoning behaviors, enables the Llama model to match Qwen's self-improvement trajectory. Our findings establish a fundamental relationship between initial reasoning behaviors and the capacity for improvement, explaining why some language models effectively utilize additional computation while others plateau.", 'score': 9, 'issue_id': 2511, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': 'fa966620baa8c013', 'authors': ['Kanishk Gandhi', 'Ayush Chakravarthy', 'Anikait Singh', 'Nathan Lile', 'Noah D. Goodman'], 'affiliations': ['Stanford University', 'SynthLabs'], 'pdf_title_img': 'assets/pdf/title_img/2503.01307.jpg', 'data': {'categories': ['#training', '#optimization', '#rl', '#reasoning'], 'emoji': '🧠', 'ru': {'title': 'Когнитивные навыки - ключ к самосовершенствованию языковых моделей', 'desc': 'Исследование показывает, что способность языковых моделей к самосовершенствованию зависит от наличия у них определенных когнитивных поведений, таких как верификация, бэктрекинг, постановка подцелей и обратное планирование. Эксперименты выявили, что модель Qwen изначально обладает этими навыками, в то время как Llama нет. Прайминг Llama примерами, содержащими эти поведения, позволил значительно улучшить ее производительность при обучении с подкреплением. Важно отметить, что наличие правильных рассуждений оказалось более критичным фактором, чем корректность ответов.'}, 'en': {'title': 'Unlocking Self-Improvement in Language Models through Reasoning', 'desc': 'This paper explores how language models can improve their problem-solving abilities through a process called test-time inference, similar to human experts. It highlights the differences in performance between two models, Qwen-2.5-3B and Llama-3.2-3B, when trained with reinforcement learning (RL) on the game Countdown. The authors identify four cognitive behaviors—verification, backtracking, subgoal setting, and backward chaining—that are crucial for effective self-improvement in these models. They demonstrate that enhancing Llama with examples of these reasoning behaviors can significantly boost its performance, suggesting that the ability to reason is more important than simply providing correct answers.'}, 'zh': {'title': '推理行为是模型自我提升的关键', 'desc': '本文探讨了语言模型在复杂任务中自我改进的能力,特别是通过强化学习(RL)实现的自我提升。研究发现,不同模型在相同的RL训练下表现差异显著,例如Qwen-2.5-3B在游戏Countdown中远超Llama-3.2-3B。我们分析了四种关键的认知行为:验证、回溯、子目标设定和逆向链推理,发现Qwen自然展现了这些推理行为,而Llama则最初缺乏。通过对Llama进行示例引导,能够显著提升其在RL中的表现,证明了推理行为的存在是模型自我改进的关键因素。'}}}, {'id': 'https://huggingface.co/papers/2503.01183', 'title': 'DiffRhythm: Blazingly Fast and Embarrassingly Simple End-to-End Full-Length Song Generation with Latent Diffusion', 'url': 'https://huggingface.co/papers/2503.01183', 'abstract': 'Recent advancements in music generation have garnered significant attention, yet existing approaches face critical limitations. 
Some current generative models can only synthesize either the vocal track or the accompaniment track. While some models can generate combined vocal and accompaniment, they typically rely on meticulously designed multi-stage cascading architectures and intricate data pipelines, hindering scalability. Additionally, most systems are restricted to generating short musical segments rather than full-length songs. Furthermore, widely used language model-based methods suffer from slow inference speeds. To address these challenges, we propose DiffRhythm, the first latent diffusion-based song generation model capable of synthesizing complete songs with both vocal and accompaniment for durations of up to 4m45s in only ten seconds, maintaining high musicality and intelligibility. Despite its remarkable capabilities, DiffRhythm is designed to be simple and elegant: it eliminates the need for complex data preparation, employs a straightforward model structure, and requires only lyrics and a style prompt during inference. Additionally, its non-autoregressive structure ensures fast inference speeds. This simplicity guarantees the scalability of DiffRhythm. Moreover, we release the complete training code along with the pre-trained model on large-scale data to promote reproducibility and further research.', 'score': 8, 'issue_id': 2516, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': '0370c6364610fd8e', 'authors': ['Ziqian Ning', 'Huakang Chen', 'Yuepeng Jiang', 'Chunbo Hao', 'Guobin Ma', 'Shuai Wang', 'Jixun Yao', 'Lei Xie'], 'affiliations': ['Northwestern Polytechnical University', 'Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen), China'], 'pdf_title_img': 'assets/pdf/title_img/2503.01183.jpg', 'data': {'categories': ['#diffusion', '#inference', '#dataset', '#open_source', '#audio'], 'emoji': '🎵', 'ru': {'title': 'DiffRhythm: Быстрая генерация полных песен с помощью латентной диффузии', 'desc': 'DiffRhythm - это первая модель генерации песен на основе латентной диффузии, способная синтезировать полные песни с вокалом и аккомпанементом длительностью до 4м45с всего за десять секунд. Модель имеет простую структуру, не требует сложной подготовки данных и использует только текст песни и стилевую подсказку при инференсе. Благодаря неавторегрессивной структуре, DiffRhythm обеспечивает высокую скорость генерации. Авторы опубликовали полный код обучения и предобученную модель для воспроизводимости результатов и дальнейших исследований.'}, 'en': {'title': 'DiffRhythm: Fast and Scalable Song Generation with Latent Diffusion', 'desc': "This paper introduces DiffRhythm, a novel music generation model that utilizes latent diffusion techniques to create full-length songs with both vocal and accompaniment tracks. Unlike existing models that are limited to short segments or require complex architectures, DiffRhythm simplifies the process by needing only lyrics and a style prompt for song generation. It achieves high musical quality and intelligibility while significantly improving inference speed, generating songs in just ten seconds. 
The authors also emphasize the model's scalability and reproducibility by providing the complete training code and pre-trained model for further research."}, 'zh': {'title': 'DiffRhythm:快速生成完整歌曲的创新模型', 'desc': '本论文介绍了一种新的音乐生成模型DiffRhythm,它能够在短短十秒内合成完整的歌曲,包括人声和伴奏,时长可达4分45秒。与现有模型相比,DiffRhythm采用潜在扩散技术,避免了复杂的数据准备和多阶段架构,确保了高效的推理速度。该模型只需歌词和风格提示即可生成音乐,具有良好的可扩展性。我们还发布了完整的训练代码和预训练模型,以促进研究的可重复性和进一步发展。'}}}, {'id': 'https://huggingface.co/papers/2503.00714', 'title': 'Speculative Ad-hoc Querying', 'url': 'https://huggingface.co/papers/2503.00714', 'abstract': "Analyzing large datasets requires responsive query execution, but executing SQL queries on massive datasets can be slow. This paper explores whether query execution can begin even before the user has finished typing, allowing results to appear almost instantly. We propose SpeQL, a system that leverages Large Language Models (LLMs) to predict likely queries based on the database schema, the user's past queries, and their incomplete query. Since exact query prediction is infeasible, SpeQL speculates on partial queries in two ways: 1) it predicts the query structure to compile and plan queries in advance, and 2) it precomputes smaller temporary tables that are much smaller than the original database, but are still predicted to contain all information necessary to answer the user's final query. Additionally, SpeQL continuously displays results for speculated queries and subqueries in real time, aiding exploratory analysis. A utility/user study showed that SpeQL improved task completion time, and participants reported that its speculative display of results helped them discover patterns in the data more quickly. In the study, SpeQL improves user's query latency by up to 289times and kept the overhead reasonable, at 4$ per hour.", 'score': 8, 'issue_id': 2514, 'pub_date': '2025-03-02', 'pub_date_card': {'ru': '2 марта', 'en': 'March 2', 'zh': '3月2日'}, 'hash': '1b0459b56fdb6894', 'authors': ['Haoyu Li', 'Srikanth Kandula', 'Maria Angels de Luis Balaguer', 'Aditya Akella', 'Venkat Arun'], 'affiliations': ['Amazon Web Services', 'Microsoft Research', 'The University of Texas at Austin'], 'pdf_title_img': 'assets/pdf/title_img/2503.00714.jpg', 'data': {'categories': ['#dataset', '#data', '#benchmark'], 'emoji': '⚡', 'ru': {'title': 'Молниеносные SQL-запросы с помощью предиктивной аналитики', 'desc': 'Статья представляет систему SpeQL, использующую большие языковые модели для предсказания SQL-запросов пользователя. SpeQL предугадывает структуру запроса и предварительно вычисляет временные таблицы, что позволяет начать выполнение запроса до его завершения пользователем. Система непрерывно отображает результаты предполагаемых запросов в реальном времени, помогая в исследовательском анализе данных. Исследование показало, что SpeQL значительно сокращает время выполнения задач и помогает пользователям быстрее обнаруживать закономерности в данных.'}, 'en': {'title': 'Instant Query Results with SpeQL!', 'desc': 'This paper introduces SpeQL, a novel system designed to enhance the speed of SQL query execution on large datasets. By utilizing Large Language Models (LLMs), SpeQL predicts user queries even before they are fully typed, allowing for near-instantaneous results. It employs two main strategies: predicting the structure of queries for pre-compilation and creating smaller temporary tables that contain essential data for answering the final query. 
A user study demonstrated that SpeQL significantly reduced query latency and helped users identify data patterns more efficiently during exploratory analysis.'}, 'zh': {'title': 'SpeQL:让查询更快的智能预测系统', 'desc': '本论文探讨了如何在用户输入SQL查询时,提前开始执行查询,以加快大数据集的查询响应速度。我们提出了SpeQL系统,利用大型语言模型(LLMs)根据数据库模式、用户的历史查询和不完整查询来预测可能的查询。SpeQL通过预测查询结构和预计算小型临时表来处理部分查询,从而在用户完成查询之前提供实时结果。研究表明,SpeQL显著提高了用户的查询速度,并帮助用户更快地发现数据中的模式。'}}}, {'id': 'https://huggingface.co/papers/2503.00784', 'title': 'DuoDecoding: Hardware-aware Heterogeneous Speculative Decoding with Dynamic Multi-Sequence Drafting', 'url': 'https://huggingface.co/papers/2503.00784', 'abstract': 'Large language models (LLMs) exhibit exceptional performance across a wide range of tasks; however, their token-by-token autoregressive generation process significantly hinders inference speed. Speculative decoding presents a promising draft-then-verify framework that reduces generation latency while maintaining output distribution fidelity. Nevertheless, the draft model introduces additional computational overhead, becoming a performance bottleneck and increasing the time to first token (TTFT). Previous approaches to mitigate draft model overhead have primarily relied on heuristics and generally failed to match the quality of the draft language models. To address these challenges, we propose DuoDecoding, a novel approach that strategically deploys the draft and target models on the CPU and GPU respectively, enabling parallel decoding while preserving draft quality. Our method incorporates a hardware-aware optimal draft budget to minimize idle times and employs dynamic multi-sequence drafting to enhance draft quality. Extensive experiments across seven tasks show that DuoDecoding achieves up to 2.61x speedup in generation latency, while reducing TTFT to 83% of that in conventional speculative decoding. The Code is available at https://github.com/KaiLv69/DuoDecoding.', 'score': 7, 'issue_id': 2510, 'pub_date': '2025-03-02', 'pub_date_card': {'ru': '2 марта', 'en': 'March 2', 'zh': '3月2日'}, 'hash': 'b4870a0e44c3cc55', 'authors': ['Kai Lv', 'Honglin Guo', 'Qipeng Guo', 'Xipeng Qiu'], 'affiliations': ['Fudan University', 'Shanghai AI Laboratory'], 'pdf_title_img': 'assets/pdf/title_img/2503.00784.jpg', 'data': {'categories': ['#inference', '#training', '#optimization'], 'emoji': '🚀', 'ru': {'title': 'DuoDecoding: Параллельное ускорение языковых моделей', 'desc': 'Статья представляет новый метод ускорения генерации текста большими языковыми моделями (LLM) под названием DuoDecoding. Этот подход использует параллельное декодирование на CPU и GPU, оптимизируя время генерации первого токена и общую латентность. DuoDecoding применяет аппаратно-ориентированный оптимальный бюджет черновика и динамическое многопоследовательное черновое декодирование для повышения качества. Эксперименты показали значительное ускорение генерации по сравнению с обычным спекулятивным декодированием.'}, 'en': {'title': 'DuoDecoding: Speeding Up Text Generation with Smart Model Deployment', 'desc': 'This paper introduces DuoDecoding, a new method to improve the speed of generating text with large language models (LLMs) while keeping the quality high. It uses a draft-then-verify approach, where a draft model quickly generates initial text, and a target model refines it, but does so in a way that reduces the time it takes to start generating text. By using both CPU and GPU for different parts of the process, DuoDecoding allows for faster and more efficient decoding. 
The results show that this method can significantly speed up text generation without sacrificing quality, achieving a notable improvement in performance across various tasks.'}, 'zh': {'title': 'DuoDecoding:加速生成的新方法', 'desc': '大型语言模型(LLMs)在多种任务中表现出色,但其逐字自回归生成过程显著影响推理速度。推测解码提供了一种有前景的草稿-验证框架,能够减少生成延迟,同时保持输出分布的准确性。我们提出的DuoDecoding方法通过在CPU和GPU上分别部署草稿模型和目标模型,实现了并行解码,提升了生成效率。实验结果表明,DuoDecoding在生成延迟上实现了最高2.61倍的加速,同时将首次生成时间缩短至传统推测解码的83%。'}}}, {'id': 'https://huggingface.co/papers/2503.00501', 'title': 'Qilin: A Multimodal Information Retrieval Dataset with APP-level User Sessions', 'url': 'https://huggingface.co/papers/2503.00501', 'abstract': "User-generated content (UGC) communities, especially those featuring multimodal content, improve user experiences by integrating visual and textual information into results (or items). The challenge of improving user experiences in complex systems with search and recommendation (S\\&R) services has drawn significant attention from both academia and industry these years. However, the lack of high-quality datasets has limited the research progress on multimodal S\\&R. To address the growing need for developing better S\\&R services, we present a novel multimodal information retrieval dataset in this paper, namely Qilin. The dataset is collected from Xiaohongshu, a popular social platform with over 300 million monthly active users and an average search penetration rate of over 70\\%. In contrast to existing datasets, Qilin offers a comprehensive collection of user sessions with heterogeneous results like image-text notes, video notes, commercial notes, and direct answers, facilitating the development of advanced multimodal neural retrieval models across diverse task settings. To better model user satisfaction and support the analysis of heterogeneous user behaviors, we also collect extensive APP-level contextual signals and genuine user feedback. Notably, Qilin contains user-favored answers and their referred results for search requests triggering the Deep Query Answering (DQA) module. This allows not only the training \\& evaluation of a Retrieval-augmented Generation (RAG) pipeline, but also the exploration of how such a module would affect users' search behavior. Through comprehensive analysis and experiments, we provide interesting findings and insights for further improving S\\&R systems. We hope that Qilin will significantly contribute to the advancement of multimodal content platforms with S\\&R services in the future.", 'score': 6, 'issue_id': 2513, 'pub_date': '2025-03-01', 'pub_date_card': {'ru': '1 марта', 'en': 'March 1', 'zh': '3月1日'}, 'hash': 'ed7fc8625b068597', 'authors': ['Jia Chen', 'Qian Dong', 'Haitao Li', 'Xiaohui He', 'Yan Gao', 'Shaosheng Cao', 'Yi Wu', 'Ping Yang', 'Chen Xu', 'Yao Hu', 'Qingyao Ai', 'Yiqun Liu'], 'affiliations': ['Tsinghua University', 'Xiaohongshu Inc.'], 'pdf_title_img': 'assets/pdf/title_img/2503.00501.jpg', 'data': {'categories': ['#multimodal', '#dataset', '#rag'], 'emoji': '🔍', 'ru': {'title': 'Qilin: мультимодальный датасет для улучшения поиска и рекомендаций', 'desc': 'Представлен новый набор данных Qilin для мультимодального информационного поиска, собранный на платформе Xiaohongshu. Датасет включает пользовательские сессии с разнородными результатами (изображения, видео, коммерческие заметки) и контекстуальными сигналами. Qilin позволяет обучать и оценивать нейросетевые модели поиска и рекомендаций, а также исследовать влияние модуля глубоких ответов на запросы. 
Авторы надеются, что Qilin внесет значительный вклад в развитие мультимодальных платформ с поисковыми сервисами.'}, 'en': {'title': 'Enhancing User Experiences with Qilin: A Multimodal Dataset for S&R Services', 'desc': 'This paper introduces Qilin, a new multimodal information retrieval dataset designed to enhance search and recommendation (S&R) services in user-generated content communities. Qilin is unique as it includes diverse user sessions with various content types, such as image-text notes and videos, which can help in developing advanced multimodal neural retrieval models. Additionally, the dataset captures user feedback and contextual signals, allowing researchers to analyze user satisfaction and behavior more effectively. The findings from this research aim to improve S&R systems and contribute to the evolution of multimodal content platforms.'}, 'zh': {'title': '推动多模态搜索与推荐服务的进步', 'desc': '本文介绍了一个新的多模态信息检索数据集Qilin,旨在改善用户在复杂系统中的搜索和推荐体验。Qilin数据集来源于小红书,包含多种类型的用户会话,如图文笔记、视频笔记和商业笔记,适用于多种任务设置。该数据集还收集了丰富的应用级上下文信号和真实用户反馈,以更好地建模用户满意度。通过对Qilin的分析和实验,本文提供了有趣的发现,期望能推动多模态内容平台的搜索和推荐服务的发展。'}}}, {'id': 'https://huggingface.co/papers/2503.01370', 'title': 'Kiss3DGen: Repurposing Image Diffusion Models for 3D Asset Generation', 'url': 'https://huggingface.co/papers/2503.01370', 'abstract': "Diffusion models have achieved great success in generating 2D images. However, the quality and generalizability of 3D content generation remain limited. State-of-the-art methods often require large-scale 3D assets for training, which are challenging to collect. In this work, we introduce Kiss3DGen (Keep It Simple and Straightforward in 3D Generation), an efficient framework for generating, editing, and enhancing 3D objects by repurposing a well-trained 2D image diffusion model for 3D generation. Specifically, we fine-tune a diffusion model to generate ''3D Bundle Image'', a tiled representation composed of multi-view images and their corresponding normal maps. The normal maps are then used to reconstruct a 3D mesh, and the multi-view images provide texture mapping, resulting in a complete 3D model. This simple method effectively transforms the 3D generation problem into a 2D image generation task, maximizing the utilization of knowledge in pretrained diffusion models. Furthermore, we demonstrate that our Kiss3DGen model is compatible with various diffusion model techniques, enabling advanced features such as 3D editing, mesh and texture enhancement, etc. Through extensive experiments, we demonstrate the effectiveness of our approach, showcasing its ability to produce high-quality 3D models efficiently.", 'score': 4, 'issue_id': 2513, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': '3decc9fe2b6f6e32', 'pdf_title_img': 'img/title_stub.png', 'data': {'categories': ['#cv', '#diffusion', '#3d'], 'emoji': '🎨', 'ru': {'title': 'Простая и эффективная 3D-генерация на основе 2D-диффузии', 'desc': "Статья представляет Kiss3DGen - эффективный фреймворк для генерации, редактирования и улучшения 3D-объектов с использованием предобученной модели диффузии для 2D-изображений. Метод основан на дообучении диффузионной модели для генерации 'Пакетного 3D-изображения', состоящего из мультиракурсных изображений и соответствующих карт нормалей. Затем карты нормалей используются для реконструкции 3D-меша, а мультиракурсные изображения обеспечивают текстурирование, что в результате дает полную 3D-модель. 
Авторы демонстрируют, что их подход совместим с различными техниками диффузионных моделей и позволяет эффективно создавать качественные 3D-модели."}, 'en': {'title': 'Kiss3DGen: Simplifying 3D Generation with 2D Diffusion Models', 'desc': "This paper presents Kiss3DGen, a novel framework that simplifies the process of generating and enhancing 3D objects by leveraging existing 2D image diffusion models. The approach involves fine-tuning a diffusion model to create a '3D Bundle Image', which consists of multiple views and normal maps that are essential for 3D reconstruction. By transforming the 3D generation challenge into a 2D image task, the method maximizes the use of knowledge from pretrained models, making it more efficient. The results show that Kiss3DGen not only generates high-quality 3D models but also supports advanced features like editing and texture enhancement."}, 'zh': {'title': '简单高效的三维生成方法', 'desc': '扩散模型在生成二维图像方面取得了巨大成功,但在三维内容生成的质量和通用性上仍然有限。现有的先进方法通常需要大量的三维资产进行训练,这些资产难以收集。我们提出了Kiss3DGen(简单直接的三维生成),这是一个高效的框架,通过重新利用经过良好训练的二维图像扩散模型来生成、编辑和增强三维物体。该方法将三维生成问题转化为二维图像生成任务,最大化利用预训练扩散模型中的知识,能够有效生成高质量的三维模型。'}}}, {'id': 'https://huggingface.co/papers/2503.01714', 'title': "Word Form Matters: LLMs' Semantic Reconstruction under Typoglycemia", 'url': 'https://huggingface.co/papers/2503.01714', 'abstract': "Human readers can efficiently comprehend scrambled words, a phenomenon known as Typoglycemia, primarily by relying on word form; if word form alone is insufficient, they further utilize contextual cues for interpretation. While advanced large language models (LLMs) exhibit similar abilities, the underlying mechanisms remain unclear. To investigate this, we conduct controlled experiments to analyze the roles of word form and contextual information in semantic reconstruction and examine LLM attention patterns. Specifically, we first propose SemRecScore, a reliable metric to quantify the degree of semantic reconstruction, and validate its effectiveness. Using this metric, we study how word form and contextual information influence LLMs' semantic reconstruction ability, identifying word form as the core factor in this process. Furthermore, we analyze how LLMs utilize word form and find that they rely on specialized attention heads to extract and process word form information, with this mechanism remaining stable across varying levels of word scrambling. This distinction between LLMs' fixed attention patterns primarily focused on word form and human readers' adaptive strategy in balancing word form and contextual information provides insights into enhancing LLM performance by incorporating human-like, context-aware mechanisms.", 'score': 3, 'issue_id': 2517, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': '4880ed4c044081c4', 'authors': ['Chenxi Wang', 'Tianle Gu', 'Zhongyu Wei', 'Lang Gao', 'Zirui Song', 'Xiuying Chen'], 'affiliations': ['Fudan University', 'Mohamed bin Zayed University of Artificial Intelligence (MBZUAI)'], 'pdf_title_img': 'assets/pdf/title_img/2503.01714.jpg', 'data': {'categories': ['#training', '#interpretability', '#data', '#multimodal', '#alignment'], 'emoji': '🔀', 'ru': {'title': 'Форма слова - ключ к пониманию перемешанного текста для ИИ', 'desc': 'Исследование посвящено способности больших языковых моделей (LLM) понимать перемешанные слова, подобно людям. Авторы предлагают метрику SemRecScore для оценки семантической реконструкции и анализируют роль формы слова и контекстной информации. 
Результаты показывают, что форма слова является ключевым фактором для LLM при обработке перемешанных слов. Анализ паттернов внимания LLM выявляет специализированные механизмы для извлечения информации о форме слова.'}, 'en': {'title': 'Unlocking LLMs: The Power of Word Form in Understanding Scrambled Text', 'desc': 'This paper explores how large language models (LLMs) understand scrambled words, similar to how humans do through a phenomenon called Typoglycemia. The authors introduce a new metric, SemRecScore, to measure how well LLMs can reconstruct meaning from scrambled text by focusing on word form and context. Their experiments reveal that LLMs primarily depend on word form for semantic reconstruction, utilizing specific attention heads to process this information. The findings suggest that incorporating more human-like, context-aware strategies could improve LLM performance in understanding language.'}, 'zh': {'title': '揭示大型语言模型的语义重建机制', 'desc': '本研究探讨了大型语言模型(LLMs)在语义重建中的能力,特别是它们如何利用单词形式和上下文信息。我们提出了一种新的度量标准SemRecScore,用于量化语义重建的程度,并验证了其有效性。研究发现,单词形式是影响LLMs语义重建能力的核心因素,且LLMs通过专门的注意力头来提取和处理单词形式信息。与人类读者在单词形式和上下文信息之间的灵活策略不同,LLMs的注意力模式主要集中在单词形式上,这为提升LLMs性能提供了新的思路。'}}}, {'id': 'https://huggingface.co/papers/2502.18890', 'title': 'From Hours to Minutes: Lossless Acceleration of Ultra Long Sequence Generation up to 100K Tokens', 'url': 'https://huggingface.co/papers/2502.18890', 'abstract': "Generating ultra-long sequences with large language models (LLMs) has become increasingly crucial but remains a highly time-intensive task, particularly for sequences up to 100K tokens. While traditional speculative decoding methods exist, simply extending their generation limits fails to accelerate the process and can be detrimental. Through an in-depth analysis, we identify three major challenges hindering efficient generation: frequent model reloading, dynamic key-value (KV) management and repetitive generation. To address these issues, we introduce TOKENSWIFT, a novel framework designed to substantially accelerate the generation process of ultra-long sequences while maintaining the target model's inherent quality. Experimental results demonstrate that TOKENSWIFT achieves over 3 times speedup across models of varying scales (1.5B, 7B, 8B, 14B) and architectures (MHA, GQA). This acceleration translates to hours of time savings for ultra-long sequence generation, establishing TOKENSWIFT as a scalable and effective solution at unprecedented lengths. Code can be found at https://github.com/bigai-nlco/TokenSwift.", 'score': 2, 'issue_id': 2517, 'pub_date': '2025-02-26', 'pub_date_card': {'ru': '26 февраля', 'en': 'February 26', 'zh': '2月26日'}, 'hash': 'd07c05abfac49ecc', 'authors': ['Tong Wu', 'Junzhe Shen', 'Zixia Jia', 'Yuxuan Wang', 'Zilong Zheng'], 'affiliations': ['NLCo Lab, BIGAI LUMIA Lab, Shanghai Jiao Tong University'], 'pdf_title_img': 'assets/pdf/title_img/2502.18890.jpg', 'data': {'categories': ['#training', '#architecture', '#long_context', '#inference', '#optimization'], 'emoji': '⚡', 'ru': {'title': 'TOKENSWIFT: революция в скорости генерации сверхдлинных текстов', 'desc': 'Исследователи представили TOKENSWIFT - новую систему для ускорения генерации сверхдлинных последовательностей большими языковыми моделями (LLM). Они выявили три основные проблемы, препятствующие эффективной генерации: частая перезагрузка модели, динамическое управление ключами-значениями и повторяющаяся генерация. 
TOKENSWIFT решает эти проблемы, позволяя ускорить процесс генерации в 3 раза для моделей различных масштабов и архитектур. Это существенно сокращает время генерации сверхдлинных последовательностей, сохраняя при этом качество целевой модели.'}, 'en': {'title': 'Accelerating Ultra-Long Sequence Generation with TOKENSWIFT', 'desc': 'This paper presents TOKENSWIFT, a new framework aimed at speeding up the generation of ultra-long sequences using large language models (LLMs). The authors identify key challenges such as model reloading, dynamic key-value management, and repetitive generation that slow down the process. By addressing these issues, TOKENSWIFT achieves over three times the speed of traditional methods while preserving the quality of the generated text. Experimental results show that this framework is effective across various model sizes and architectures, making it a significant advancement in the field of sequence generation.'}, 'zh': {'title': 'TOKENSWIFT:加速超长序列生成的创新框架', 'desc': '生成超长序列对于大型语言模型(LLMs)变得越来越重要,但这一过程通常非常耗时,尤其是对于长达10万标记的序列。传统的推测解码方法在延长生成限制时并未加速过程,反而可能造成负面影响。我们通过深入分析,识别出影响高效生成的三个主要挑战:频繁的模型重载、动态键值(KV)管理和重复生成。为了解决这些问题,我们提出了TOKENSWIFT,一个新框架,旨在显著加快超长序列的生成过程,同时保持目标模型的固有质量。'}}}, {'id': 'https://huggingface.co/papers/2502.16779', 'title': 'Unposed Sparse Views Room Layout Reconstruction in the Age of Pretrain Model', 'url': 'https://huggingface.co/papers/2502.16779', 'abstract': 'Room layout estimation from multiple-perspective images is poorly investigated due to the complexities that emerge from multi-view geometry, which requires muti-step solutions such as camera intrinsic and extrinsic estimation, image matching, and triangulation. However, in 3D reconstruction, the advancement of recent 3D foundation models such as DUSt3R has shifted the paradigm from the traditional multi-step structure-from-motion process to an end-to-end single-step approach. To this end, we introduce Plane-DUSt3R, a novel method for multi-view room layout estimation leveraging the 3D foundation model DUSt3R. Plane-DUSt3R incorporates the DUSt3R framework and fine-tunes on a room layout dataset (Structure3D) with a modified objective to estimate structural planes. By generating uniform and parsimonious results, Plane-DUSt3R enables room layout estimation with only a single post-processing step and 2D detection results. Unlike previous methods that rely on single-perspective or panorama image, Plane-DUSt3R extends the setting to handle multiple-perspective images. Moreover, it offers a streamlined, end-to-end solution that simplifies the process and reduces error accumulation. 
Experimental results demonstrate that Plane-DUSt3R not only outperforms state-of-the-art methods on the synthetic dataset but also proves robust and effective on in the wild data with different image styles such as cartoon.Our code is available at: https://github.com/justacar/Plane-DUSt3R', 'score': 2, 'issue_id': 2516, 'pub_date': '2025-02-24', 'pub_date_card': {'ru': '24 февраля', 'en': 'February 24', 'zh': '2月24日'}, 'hash': '4a9f6cc2fb1ab840', 'authors': ['Yaxuan Huang', 'Xili Dai', 'Jianan Wang', 'Xianbiao Qi', 'Yixing Yuan', 'Xiangyu Yue'], 'affiliations': ['Astribot', 'Hong Kong Center for Construction Robotics, The Hong Kong University of Science and Technology', 'Intellifusion Inc.', 'MMLab, The Chinese University of Hong Kong', 'The Hong Kong University of Science and Technology (Guangzhou)'], 'pdf_title_img': 'assets/pdf/title_img/2502.16779.jpg', 'data': {'categories': ['#3d', '#optimization', '#cv', '#synthetic'], 'emoji': '🏠', 'ru': {'title': 'Революция в оценке планировки помещений: от множества шагов к единому решению', 'desc': 'Статья представляет Plane-DUSt3R - новый метод для оценки планировки помещений по множественным ракурсам изображений. Этот подход использует 3D-модель фундаментального уровня DUSt3R и дообучается на наборе данных Structure3D для оценки структурных плоскостей. Plane-DUSt3R предлагает упрощенное сквозное решение, которое превосходит современные методы на синтетических данных и показывает надежность на реальных изображениях различных стилей. Метод позволяет оценивать планировку помещений с помощью одного шага постобработки и результатов 2D-обнаружения.'}, 'en': {'title': 'Revolutionizing Room Layout Estimation with Plane-DUSt3R', 'desc': 'This paper presents Plane-DUSt3R, a new method for estimating room layouts from multiple images taken from different perspectives. It builds on the DUSt3R 3D foundation model, moving away from traditional multi-step processes to a more efficient end-to-end approach. By fine-tuning the model on a specific dataset, Plane-DUSt3R can accurately identify structural planes with minimal post-processing. The results show that this method not only surpasses existing techniques on synthetic data but also performs well on real-world images with varying styles.'}, 'zh': {'title': '简化多视角房间布局估计的全新方法', 'desc': '本论文提出了一种新的多视角房间布局估计方法,称为Plane-DUSt3R。该方法利用了先进的3D基础模型DUSt3R,简化了传统的多步骤流程,采用端到端的单步骤方法。Plane-DUSt3R通过在房间布局数据集上进行微调,能够有效估计结构平面,并生成一致且简洁的结果。实验结果表明,Plane-DUSt3R在合成数据集上超越了现有的最先进方法,并在不同风格的真实数据上表现出色。'}}}, {'id': 'https://huggingface.co/papers/2503.01295', 'title': 'CodeArena: A Collective Evaluation Platform for LLM Code Generation', 'url': 'https://huggingface.co/papers/2503.01295', 'abstract': 'Large Language Models (LLMs) have reshaped code generation by synergizing their exceptional comprehension of natural language and programming syntax, thereby substantially boosting developer productivity. These advancements have prompted numerous efforts to quantitatively evaluate their coding capabilities. However, persistent challenges, such as benchmark leakage, data dissipation, and limited system accessibility, continue to impede a timely and accurate assessment. To address these limitations, we introduce CodeArena, an online evaluation framework tailored for LLM code generation. The key innovation is a collective evaluation mechanism, which dynamically recalibrates individual model scores based on the holistic performance of all participating models, mitigating score biases caused by widespread benchmark leakage. 
In addition, CodeArena ensures open access to all submitted solutions and test cases and provides automation-friendly APIs to streamline the code evaluation workflow. Our main contributions are: (1) a collective evaluation system for unbiased assessment, (2) a public repository of solutions and test cases, and (3) automation-ready APIs for seamless integration.', 'score': 2, 'issue_id': 2514, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': '96f50dd9e636b12e', 'authors': ['Mingzhe Du', 'Anh Tuan Luu', 'Bin Ji', 'Xiaobao Wu', 'Dong Huang', 'Terry Yue Zhuo', 'Qian Liu', 'See-Kiong Ng'], 'affiliations': ['ByteDance', 'Monash University', 'Nanyang Technological University', 'National University of Singapore', 'The University of Hong Kong'], 'pdf_title_img': 'assets/pdf/title_img/2503.01295.jpg', 'data': {'categories': ['#dataset', '#benchmark', '#leakage', '#open_source'], 'emoji': '🏟️', 'ru': {'title': 'CodeArena: Справедливая арена для оценки LLM в генерации кода', 'desc': 'CodeArena - это новая онлайн-платформа для оценки генерации кода большими языковыми моделями (LLM). Она использует коллективный механизм оценки, который динамически пересчитывает индивидуальные оценки моделей на основе общей производительности всех участвующих моделей. Это помогает снизить искажения оценок, вызванные утечкой тестовых данных. CodeArena также предоставляет открытый доступ ко всем отправленным решениям и тестовым случаям, а также API для автоматизации процесса оценки.'}, 'en': {'title': 'Revolutionizing Code Evaluation with CodeArena', 'desc': 'This paper discusses the impact of Large Language Models (LLMs) on code generation, highlighting their ability to understand both natural language and programming syntax, which enhances developer productivity. It identifies ongoing issues in evaluating LLM coding capabilities, such as benchmark leakage and limited access to evaluation systems. To overcome these challenges, the authors present CodeArena, an online framework that offers a collective evaluation mechanism to provide unbiased assessments of LLMs. CodeArena also features a public repository for solutions and test cases, along with APIs for easy integration into existing workflows.'}, 'zh': {'title': 'CodeArena:公平评估LLM代码生成的创新平台', 'desc': '大型语言模型(LLMs)通过结合对自然语言和编程语法的深刻理解,极大地提升了代码生成的效率,进而提高了开发者的生产力。为了量化评估这些模型的编码能力,许多研究工作应运而生,但仍面临基准泄漏、数据消散和系统可访问性有限等挑战。为了解决这些问题,我们提出了CodeArena,这是一个专为LLM代码生成设计的在线评估框架。其核心创新在于集体评估机制,能够根据所有参与模型的整体表现动态调整个别模型的评分,从而减少因基准泄漏造成的评分偏差。'}}}, {'id': 'https://huggingface.co/papers/2503.01739', 'title': 'VideoUFO: A Million-Scale User-Focused Dataset for Text-to-Video Generation', 'url': 'https://huggingface.co/papers/2503.01739', 'abstract': "Text-to-video generative models convert textual prompts into dynamic visual content, offering wide-ranging applications in film production, gaming, and education. However, their real-world performance often falls short of user expectations. One key reason is that these models have not been trained on videos related to some topics users want to create. In this paper, we propose VideoUFO, the first Video dataset specifically curated to align with Users' FOcus in real-world scenarios. Beyond this, our VideoUFO also features: (1) minimal (0.29%) overlap with existing video datasets, and (2) videos searched exclusively via YouTube's official API under the Creative Commons license. These two attributes provide future researchers with greater freedom to broaden their training sources. 
The VideoUFO comprises over 1.09 million video clips, each paired with both a brief and a detailed caption (description). Specifically, through clustering, we first identify 1,291 user-focused topics from the million-scale real text-to-video prompt dataset, VidProM. Then, we use these topics to retrieve videos from YouTube, split the retrieved videos into clips, and generate both brief and detailed captions for each clip. After verifying the clips with specified topics, we are left with about 1.09 million video clips. Our experiments reveal that (1) current 16 text-to-video models do not achieve consistent performance across all user-focused topics; and (2) a simple model trained on VideoUFO outperforms others on worst-performing topics. The dataset is publicly available at https://huggingface.co/datasets/WenhaoWang/VideoUFO under the CC BY 4.0 License.", 'score': 2, 'issue_id': 2512, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': '046fdeee8939e82c', 'authors': ['Wenhao Wang', 'Yi Yang'], 'affiliations': ['University of Technology Sydney', 'Zhejiang University'], 'pdf_title_img': 'assets/pdf/title_img/2503.01739.jpg', 'data': {'categories': ['#video', '#dataset', '#data', '#games', '#open_source'], 'emoji': '🎬', 'ru': {'title': 'VideoUFO: Новый эталонный датасет для генерации видео по запросу', 'desc': 'Статья представляет VideoUFO - новый набор данных для обучения моделей генерации видео по текстовому описанию. Этот датасет содержит более 1,09 миллиона видеоклипов с подробными и краткими описаниями, охватывающих 1291 тему, актуальную для пользователей. VideoUFO отличается минимальным пересечением с существующими наборами данных и использованием только видео под лицензией Creative Commons. Эксперименты показали, что простая модель, обученная на VideoUFO, превосходит другие модели на наиболее сложных темах.'}, 'en': {'title': 'Empowering Text-to-Video Models with User-Focused Data', 'desc': 'This paper introduces VideoUFO, a novel video dataset designed to enhance text-to-video generative models by focusing on user-relevant topics. The dataset contains over 1.09 million video clips, each accompanied by both brief and detailed captions, ensuring minimal overlap with existing datasets. By clustering user prompts, the authors identified 1,291 specific topics to guide video retrieval from YouTube, which were then segmented into clips. Experiments show that models trained on VideoUFO significantly outperform existing models, particularly on challenging topics, highlighting the importance of tailored training data in machine learning applications.'}, 'zh': {'title': '提升文本到视频生成的用户体验', 'desc': '本文介绍了一种新的视频数据集VideoUFO,旨在提高文本到视频生成模型的性能。该数据集专注于用户关注的主题,包含超过109万个视频片段,并为每个片段提供简短和详细的描述。VideoUFO与现有数据集的重叠率极低,且所有视频均通过YouTube的官方API获取,确保了数据的多样性和合法性。实验结果表明,使用VideoUFO训练的模型在用户关注的主题上表现优于其他模型。'}}}, {'id': 'https://huggingface.co/papers/2503.01807', 'title': 'Large-Scale Data Selection for Instruction Tuning', 'url': 'https://huggingface.co/papers/2503.01807', 'abstract': 'Selecting high-quality training data from a larger pool is a crucial step when instruction-tuning language models, as carefully curated datasets often produce models that outperform those trained on much larger, noisier datasets. Automated data selection approaches for instruction-tuning are typically tested by selecting small datasets (roughly 10k samples) from small pools (100-200k samples). 
However, popular deployed instruction-tuned models often train on hundreds of thousands to millions of samples, subsampled from even larger data pools. We present a systematic study of how well data selection methods scale to these settings, selecting up to 2.5M samples from pools of up to 5.8M samples and evaluating across 7 diverse tasks. We show that many recently proposed methods fall short of random selection in this setting (while using more compute), and even decline in performance when given access to larger pools of data to select over. However, we find that a variant of representation-based data selection (RDS+), which uses weighted mean pooling of pretrained LM hidden states, consistently outperforms more complex methods across all settings tested -- all whilst being more compute-efficient. Our findings highlight that the scaling properties of proposed automated selection methods should be more closely examined. We release our code, data, and models at https://github.com/hamishivi/automated-instruction-selection.', 'score': 2, 'issue_id': 2511, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': '8bbc980a9ef867f7', 'authors': ['Hamish Ivison', 'Muru Zhang', 'Faeze Brahman', 'Pang Wei Koh', 'Pradeep Dasigi'], 'affiliations': ['Allen Institute for AI', 'University of Southern California', 'University of Washington'], 'pdf_title_img': 'assets/pdf/title_img/2503.01807.jpg', 'data': {'categories': ['#data', '#open_source', '#optimization', '#dataset', '#training'], 'emoji': '🔍', 'ru': {'title': 'Эффективный отбор данных для обучения языковых моделей: меньше значит больше', 'desc': 'Эта статья исследует методы автоматического отбора данных для инструктивной настройки языковых моделей. Авторы проводят систематическое изучение эффективности различных методов при масштабировании до больших объемов данных, выбирая до 2,5 миллионов образцов из пулов до 5,8 миллионов. Результаты показывают, что многие недавно предложенные методы уступают случайному отбору в этих условиях, однако вариант метода отбора на основе представлений (RDS+) превосходит более сложные подходы. Исследование подчеркивает важность тщательного анализа масштабируемости методов автоматического отбора данных.'}, 'en': {'title': 'Quality Over Quantity: Smart Data Selection for Language Models', 'desc': 'This paper investigates the importance of selecting high-quality training data for instruction-tuning language models. It reveals that many automated data selection methods do not perform better than random selection when scaling to larger datasets, which can include millions of samples. The study introduces a representation-based data selection method (RDS+) that consistently outperforms more complex approaches while being more efficient in terms of computational resources. 
The authors emphasize the need for a deeper examination of how these selection methods behave as the size of the data pools increases.'}, 'zh': {'title': '高效选择:优化语言模型训练数据的关键', 'desc': '在对语言模型进行指令调优时,从更大数据集中选择高质量的训练数据是一个关键步骤。经过精心策划的数据集通常能产生比那些在更大、更嘈杂的数据集上训练的模型更好的效果。我们进行了系统研究,评估数据选择方法在大规模数据集上的表现,发现许多新提出的方法在这种情况下的表现不如随机选择。我们还发现一种基于表示的数据选择变体(RDS+)在所有测试设置中始终优于更复杂的方法,同时计算效率更高。'}}}, {'id': 'https://huggingface.co/papers/2503.01506', 'title': 'SampleMix: A Sample-wise Pre-training Data Mixing Strategey by Coordinating Data Quality and Diversity', 'url': 'https://huggingface.co/papers/2503.01506', 'abstract': "Existing pretraining data mixing methods for large language models (LLMs) typically follow a domain-wise methodology, a top-down process that first determines domain weights and then performs uniform data sampling across each domain. However, these approaches neglect significant inter-domain overlaps and commonalities, failing to control the global diversity of the constructed training dataset. Further, uniform sampling within domains ignores fine-grained sample-specific features, potentially leading to suboptimal data distribution. To address these shortcomings, we propose a novel sample-wise data mixture approach based on a bottom-up paradigm. This method performs global cross-domain sampling by systematically evaluating the quality and diversity of each sample, thereby dynamically determining the optimal domain distribution. Comprehensive experiments across multiple downstream tasks and perplexity assessments demonstrate that SampleMix surpasses existing domain-based methods. Meanwhile, SampleMix requires 1.4x to 2.1x training steps to achieves the baselines' performance, highlighting the substantial potential of SampleMix to optimize pre-training data.", 'score': 1, 'issue_id': 2517, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': '018cc621eb1ee12b', 'authors': ['Xiangyu Xi', 'Deyang Kong', 'Jian Yang', 'Jiawei Yang', 'Zhengyu Chen', 'Wei Wang', 'Jingang Wang', 'Xunliang Cai', 'Shikun Zhang', 'Wei Ye'], 'affiliations': ['Meituan Group, Beijing, China', 'National Engineering Research Center for Software Engineering, Peking University, Beijing, China'], 'pdf_title_img': 'assets/pdf/title_img/2503.01506.jpg', 'data': {'categories': ['#transfer_learning', '#training', '#optimization', '#data'], 'emoji': '🔀', 'ru': {'title': 'SampleMix: революция в смешивании данных для LLM', 'desc': 'В статье представлен новый подход к смешиванию предобучающих данных для больших языковых моделей (LLM), названный SampleMix. В отличие от традиционных методов, основанных на доменах, SampleMix использует выборку на уровне отдельных образцов, оценивая их качество и разнообразие. Этот метод позволяет динамически определять оптимальное распределение доменов и учитывать межdomенные пересечения. Эксперименты показали, что SampleMix превосходит существующие методы, основанные на доменах, хотя и требует больше шагов обучения.'}, 'en': {'title': 'Revolutionizing Data Mixing for Better Language Model Training', 'desc': 'This paper introduces SampleMix, a new method for mixing pretraining data for large language models (LLMs). Unlike traditional domain-wise approaches that sample uniformly within predefined domains, SampleMix uses a bottom-up strategy that evaluates the quality and diversity of individual samples across domains. 
This allows for a more dynamic and optimal distribution of training data, addressing the limitations of inter-domain overlaps and sample-specific features. Experimental results show that SampleMix not only outperforms existing methods but also requires fewer training steps to achieve comparable performance.'}, 'zh': {'title': '样本级数据混合,优化预训练数据的未来', 'desc': '现有的大型语言模型预训练数据混合方法通常采用领域导向的方法,先确定领域权重,再在每个领域内进行均匀数据采样。然而,这些方法忽视了领域之间的重要重叠和共性,未能有效控制训练数据集的全球多样性。此外,领域内的均匀采样忽略了样本特定的细微特征,可能导致数据分布不理想。为了解决这些问题,我们提出了一种基于自下而上的新型样本级数据混合方法,能够通过系统评估每个样本的质量和多样性,动态确定最佳领域分布。'}}}, {'id': 'https://huggingface.co/papers/2503.01103', 'title': 'Direct Discriminative Optimization: Your Likelihood-Based Visual Generative Model is Secretly a GAN Discriminator', 'url': 'https://huggingface.co/papers/2503.01103', 'abstract': 'While likelihood-based generative models, particularly diffusion and autoregressive models, have achieved remarkable fidelity in visual generation, the maximum likelihood estimation (MLE) objective inherently suffers from a mode-covering tendency that limits the generation quality under limited model capacity. In this work, we propose Direct Discriminative Optimization (DDO) as a unified framework that bridges likelihood-based generative training and the GAN objective to bypass this fundamental constraint. Our key insight is to parameterize a discriminator implicitly using the likelihood ratio between a learnable target model and a fixed reference model, drawing parallels with the philosophy of Direct Preference Optimization (DPO). Unlike GANs, this parameterization eliminates the need for joint training of generator and discriminator networks, allowing for direct, efficient, and effective finetuning of a well-trained model to its full potential beyond the limits of MLE. DDO can be performed iteratively in a self-play manner for progressive model refinement, with each round requiring less than 1% of pretraining epochs. Our experiments demonstrate the effectiveness of DDO by significantly advancing the previous SOTA diffusion model EDM, reducing FID scores from 1.79/1.58 to new records of 1.30/0.97 on CIFAR-10/ImageNet-64 datasets, and by consistently improving both guidance-free and CFG-enhanced FIDs of visual autoregressive models on ImageNet 256times256.', 'score': 0, 'issue_id': 2517, 'pub_date': '2025-03-03', 'pub_date_card': {'ru': '3 марта', 'en': 'March 3', 'zh': '3月3日'}, 'hash': 'd8b58c1a2c49da16', 'authors': ['Kaiwen Zheng', 'Yongxin Chen', 'Huayu Chen', 'Guande He', 'Ming-Yu Liu', 'Jun Zhu', 'Qinsheng Zhang'], 'affiliations': ['NVIDIA', 'The University of Texas at', 'Tsinghua University'], 'pdf_title_img': 'assets/pdf/title_img/2503.01103.jpg', 'data': {'categories': ['#training', '#diffusion', '#cv', '#optimization'], 'emoji': '🚀', 'ru': {'title': 'DDO: Прорыв в обучении генеративных моделей без ограничений MLE', 'desc': 'Авторы статьи предлагают новый метод обучения генеративных моделей под названием Direct Discriminative Optimization (DDO). Этот подход объединяет методы обучения на основе правдоподобия и цели генеративно-состязательных сетей (GAN), чтобы преодолеть ограничения метода максимального правдоподобия (MLE). DDO использует отношение правдоподобия между обучаемой целевой моделью и фиксированной эталонной моделью для параметризации дискриминатора. 
Эксперименты показывают, что DDO значительно улучшает результаты современных диффузионных и авторегрессионных моделей на различных наборах данных.'}, 'en': {'title': 'Enhancing Generative Models with Direct Discriminative Optimization', 'desc': 'This paper introduces Direct Discriminative Optimization (DDO), a new framework that enhances the performance of generative models by combining likelihood-based training with concepts from Generative Adversarial Networks (GANs). DDO addresses the limitations of maximum likelihood estimation (MLE) by using a discriminator that is parameterized through the likelihood ratio of a target model and a fixed reference model. This approach allows for efficient finetuning of pre-trained models without the need for joint training of generator and discriminator networks. The results show that DDO significantly improves the state-of-the-art performance in visual generation tasks, achieving lower FID scores on popular datasets like CIFAR-10 and ImageNet.'}, 'zh': {'title': '直接判别优化:提升生成模型的新方法', 'desc': '本文提出了一种新的方法,称为直接判别优化(DDO),旨在提高生成模型的质量。DDO通过将生成训练与GAN目标结合,克服了最大似然估计(MLE)在模型容量有限时的局限性。该方法通过使用可学习的目标模型与固定参考模型之间的似然比来隐式参数化判别器,从而简化了生成器和判别器的联合训练。实验结果表明,DDO显著提高了现有扩散模型的性能,并在多个数据集上创造了新的记录。'}}}, {'id': 'https://huggingface.co/papers/2503.01063', 'title': 'AI-Invented Tonal Languages: Preventing a Machine Lingua Franca Beyond Human Understanding', 'url': 'https://huggingface.co/papers/2503.01063', 'abstract': 'This paper investigates the potential for large language models (LLMs) to develop private tonal languages for machine-to-machine (M2M) communication. Inspired by cryptophasia in human twins (affecting up to 50% of twin births) and natural tonal languages like Mandarin and Vietnamese, we implement a precise character-to-frequency mapping system that encodes the full ASCII character set (32-126) using musical semitones. Each character is assigned a unique frequency, creating a logarithmic progression beginning with space (220 Hz) and ending with tilde (50,175.42 Hz). This spans approximately 7.9 octaves, with higher characters deliberately mapped to ultrasonic frequencies beyond human perception (>20 kHz). Our implemented software prototype demonstrates this encoding through visualization, auditory playback, and ABC musical notation, allowing for analysis of information density and transmission speed. Testing reveals that tonal encoding can achieve information rates exceeding human speech while operating partially outside human perceptual boundaries. This work responds directly to concerns about AI systems catastrophically developing private languages within the next five years, providing a concrete prototype software example of how such communication might function and the technical foundation required for its emergence, detection, and governance.', 'score': 0, 'issue_id': 2515, 'pub_date': '2025-03-02', 'pub_date_card': {'ru': '2 марта', 'en': 'March 2', 'zh': '3月2日'}, 'hash': '7021403742a91f3e', 'authors': ['David Noever'], 'affiliations': ['PeopleTec, Inc., Huntsville, AL'], 'pdf_title_img': 'assets/pdf/title_img/2503.01063.jpg', 'data': {'categories': ['#security', '#ethics', '#audio', '#multimodal'], 'emoji': '🎵', 'ru': {'title': 'Тональные языки: секретный код машин будущего', 'desc': 'Это исследование изучает потенциал больших языковых моделей (LLM) для разработки приватных тональных языков для коммуникации между машинами. 
Авторы создали систему кодирования, которая сопоставляет каждому символу ASCII уникальную частоту, формируя логарифмическую прогрессию от 220 Гц до 50,175.42 Гц. Разработанный программный прототип демонстрирует это кодирование через визуализацию, воспроизведение звука и нотацию ABC. Тестирование показало, что тональное кодирование может достигать скорости передачи информации, превышающей человеческую речь, при этом частично работая за пределами человеческого восприятия.'}, 'en': {'title': 'Unlocking Machine Communication with Tonal Languages', 'desc': 'This paper explores how large language models (LLMs) can create private tonal languages for communication between machines. It draws inspiration from the phenomenon of cryptophasia in twins and uses a character-to-frequency mapping system to encode ASCII characters into musical tones. Each character is assigned a unique frequency, allowing for efficient data transmission that exceeds human speech rates. The study provides a prototype that visualizes and plays back this encoding, addressing concerns about AI developing private languages and offering a framework for understanding and managing such systems.'}, 'zh': {'title': '探索机器间的私有音调语言', 'desc': '本论文研究了大型语言模型(LLMs)在机器间(M2M)通信中开发私有音调语言的潜力。我们借鉴了人类双胞胎中的密码语言现象和自然音调语言,如普通话和越南语,实施了一种精确的字符到频率映射系统。每个字符被分配一个独特的频率,形成一个对数进程,覆盖约7.9个八度,并将高频字符映射到人类听觉范围之外的超声波频率。我们的软件原型展示了这种编码的可视化、听觉播放和音乐记谱法,分析了信息密度和传输速度,测试结果表明音调编码的信息传输速率超过人类语言。'}}}, {'id': 'https://huggingface.co/papers/2503.00729', 'title': 'CLEA: Closed-Loop Embodied Agent for Enhancing Task Execution in Dynamic Environments', 'url': 'https://huggingface.co/papers/2503.00729', 'abstract': "Large Language Models (LLMs) exhibit remarkable capabilities in the hierarchical decomposition of complex tasks through semantic reasoning. However, their application in embodied systems faces challenges in ensuring reliable execution of subtask sequences and achieving one-shot success in long-term task completion. To address these limitations in dynamic environments, we propose Closed-Loop Embodied Agent (CLEA) -- a novel architecture incorporating four specialized open-source LLMs with functional decoupling for closed-loop task management. The framework features two core innovations: (1) Interactive task planner that dynamically generates executable subtasks based on the environmental memory, and (2) Multimodal execution critic employing an evaluation framework to conduct a probabilistic assessment of action feasibility, triggering hierarchical re-planning mechanisms when environmental perturbations exceed preset thresholds. To validate CLEA's effectiveness, we conduct experiments in a real environment with manipulable objects, using two heterogeneous robots for object search, manipulation, and search-manipulation integration tasks. Across 12 task trials, CLEA outperforms the baseline model, achieving a 67.3% improvement in success rate and a 52.8% increase in task completion rate. 
These results demonstrate that CLEA significantly enhances the robustness of task planning and execution in dynamic environments.", 'score': 0, 'issue_id': 2514, 'pub_date': '2025-03-02', 'pub_date_card': {'ru': '2 марта', 'en': 'March 2', 'zh': '3月2日'}, 'hash': '57f6361f66ec99cf', 'authors': ['Mingcong Lei', 'Ge Wang', 'Yiming Zhao', 'Zhixin Mai', 'Qing Zhao', 'Yao Guo', 'Zhen Li', 'Shuguang Cui', 'Yatong Han', 'Jinke Ren'], 'affiliations': ['Guangdong Provincial Key Laboratory of Future Networks of Intelligence, The Chinese University of Hong Kong, Shenzhen', 'Harbin Engineering University, Harbin', 'Infused Synapse AI, Shenzhen', 'Institute of Medical Robotics, School of Biomedical Engineering, Shanghai Jiao Tong University, Shanghai', 'School of Science and Engineering (SSE), FNii-Shenzhen', 'Shenzhen Future Network of Intelligence Institute (FNii-Shenzhen)'], 'pdf_title_img': 'assets/pdf/title_img/2503.00729.jpg', 'data': {'categories': ['#architecture', '#reasoning', '#open_source', '#robotics', '#agents', '#optimization'], 'emoji': '🤖', 'ru': {'title': 'CLEA: Повышение надежности выполнения задач роботами с помощью языковых моделей', 'desc': 'Статья представляет новую архитектуру под названием CLEA (Closed-Loop Embodied Agent) для улучшения выполнения сложных задач роботами в динамических средах. CLEA использует четыре специализированные языковые модели с открытым исходным кодом для управления задачами в замкнутом цикле. Ключевые инновации включают интерактивный планировщик задач и мультимодальный критик выполнения для оценки выполнимости действий. Эксперименты показали, что CLEA значительно превосходит базовую модель по показателям успешности и завершения задач в реальной среде с манипулируемыми объектами.'}, 'en': {'title': 'Enhancing Task Execution in Dynamic Environments with CLEA', 'desc': 'This paper introduces the Closed-Loop Embodied Agent (CLEA), a new architecture designed to improve the performance of Large Language Models (LLMs) in dynamic environments. CLEA features an interactive task planner that creates subtasks based on real-time environmental data, allowing for better adaptability. Additionally, it includes a multimodal execution critic that evaluates the feasibility of actions and adjusts plans when unexpected changes occur. Experimental results show that CLEA significantly enhances task success and completion rates compared to traditional models, demonstrating its effectiveness in complex, real-world scenarios.'}, 'zh': {'title': '闭环具身代理:提升动态环境中的任务执行能力', 'desc': '大型语言模型(LLMs)在复杂任务的层次分解和语义推理方面表现出色。然而,在具身系统中应用时,确保子任务序列的可靠执行和实现长期任务的一次性成功面临挑战。为了解决这些问题,我们提出了闭环具身代理(CLEA),这是一种新颖的架构,结合了四个专门的开源LLM,并实现功能解耦以进行闭环任务管理。通过动态生成可执行的子任务和使用多模态执行评估框架,CLEA显著提高了在动态环境中任务规划和执行的鲁棒性。'}}}];
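// Cached DOM references and client-side state for sorting and category filtering of the article cards.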
const articlesContainer = document.getElementById('articles-container');
const sortDropdown = document.getElementById('sort-dropdown');
const categoryFiltersContainer = document.getElementById('category-filters');
const categoryFiltersLogicOptions = document.getElementById('category-options');
const categoryToggle = document.getElementById('category-toggle');
const clearCategoriesButton = document.getElementById('clear-categories');
let selectedCategories = [];
let selectedArticles = [];
let sortBy = 'issue_id';
let showLimitHint = false;
let filterLogicIsAnd = false;
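// Parse the "cat" query parameter into "#category" tokens used by the filter buttons,
// e.g. "?cat=rl,reasoning" -> ['#rl', '#reasoning'].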
function getUrlParameters() {
const urlParams = new URLSearchParams(window.location.search);
const categoriesParam = urlParams.get('cat');
let categories = categoriesParam ? categoriesParam.split(',') : [];
categories = categories.map(element => `#${element}`);
return categories;
}
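// Mirror the current selection into the URL (dropping the leading '#') so a filtered view can be shared,
// e.g. ['#rl', '#rag'] -> "?cat=rl,rag"; the query string is cleared when nothing is selected.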
function updateUrlWithCategories() {
let cleanedCategories = selectedCategories.map(element => element.replace(/^#/, ''));
const newUrl = cleanedCategories.length > 0
? `${window.location.pathname}?cat=${cleanedCategories.join(',')}`
: window.location.pathname;
window.history.pushState({}, '', newUrl);
}
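// Restore persisted UI preferences (dark mode, sort order, AND/OR filter logic) from localStorage
// and sync the corresponding controls.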
function loadSettings() {
const themeToggle = document.getElementById('theme-toggle');
const sortDropdown = document.getElementById('sort-dropdown');
const isDarkMode = localStorage.getItem('darkMode') === 'true';
let settingSortBy = localStorage.getItem('sort_by');
filterLogicIsAnd = localStorage.getItem('filter_logic_is_and') === 'true';
if (isDarkMode) {
document.body.classList.remove('light-theme');
document.body.classList.add('dark-theme');
themeToggle.checked = true;
const title = document.getElementById('doomgrad');
title.innerHTML = "hf nightly";
const titleSign = document.getElementById('doomgrad-icon');
titleSign.classList.add('rotate');
}
if ((!settingSortBy) || (settingSortBy === 'null')) {
settingSortBy = 'issue_id';
}
if (filterLogicIsAnd) {
document.getElementById('filter-logic-and').checked = true;
} else {
document.getElementById('filter-logic-or').checked = true;
}
sortDropdown.value = settingSortBy;
sortBy = settingSortBy;
}
document.getElementById('theme-toggle').addEventListener('change', toggleTheme);
document.getElementById('filter-logic-and').addEventListener('change', () => {
filterLogicIsAnd = true;
localStorage.setItem('filter_logic_is_and', 'true');
filterAndRenderArticles();
updateSelectedArticlesTitle();
});
document.getElementById('filter-logic-or').addEventListener('change', () => {
filterLogicIsAnd = false;
localStorage.setItem('filter_logic_is_and', 'false');
filterAndRenderArticles();
updateSelectedArticlesTitle();
});
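// Collect the sorted, de-duplicated list of category tags across all articles.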
function getUniqueCategories(articles) {
const categories = new Set();
articles.forEach(article => {
if (article.data && article.data.categories) {
article.data.categories.forEach(cat => categories.add(cat));
}
});
let res = Array.from(categories);
res.sort();
return res;
}
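// Render one clickable filter button per category; entries listed without a count are shown as inactive.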
function createCategoryButtons() {
//const categories = getUniqueCategories(articlesData);
const categories = ['#3d (3)', '#agents (1)', '#agi (1)', '#alignment (2)', '#architecture (3)', '#audio (2)', '#benchmark (3)', '#cv (4)', '#data (6)', '#dataset (7)', '#diffusion (4)', '#ethics (1)', '#games (2)', '#graphs', '#hallucinations', '#healthcare', '#inference (3)', '#interpretability (1)', '#leakage (1)', '#long_context (2)', '#low_resource', '#machine_translation', '#math', '#multilingual', '#multimodal (5)', '#open_source (6)', '#optimization (12)', '#plp', '#rag (2)', '#reasoning (3)', '#rl (2)', '#rlhf (2)', '#robotics (1)', '#science', '#security (1)', '#small_models (1)', '#story_generation', '#survey', '#synthetic (2)', '#training (11)', '#transfer_learning (1)', '#video (1)'];
categories.forEach(category => {
let catNameSplitted = category.split(/(\s+)/);
let catName = catNameSplitted[0];
const button = document.createElement('span');
button.textContent = catName;
button.className = 'category-button';
if (catNameSplitted.length < 2) {
button.classList.add('inactive');
}
button.onclick = () => toggleCategory(catName, button);
categoryFiltersContainer.appendChild(button);
});
}
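// Toggle a category in the current selection, update its button styling, then re-filter and persist the choice.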
function toggleCategory(category, button) {
const index = selectedCategories.indexOf(category);
if (index === -1) {
selectedCategories.push(category);
button.classList.add('active');
} else {
selectedCategories.splice(index, 1);
button.classList.remove('active');
}
filterAndRenderArticles();
saveCategorySelection();