forked from alshedivat/al-folio
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpapers.bib
More file actions
2176 lines (2035 loc) · 267 KB
/
papers.bib
File metadata and controls
2176 lines (2035 loc) · 267 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@inproceedings{kelber2026retrieval,
  author    = {Florian Kelber and Matthias Jobst and Yuni Susanti and Michael F{\"a}rber},
  title     = {Do We Need Bigger Models for Science? Task-Aware Retrieval with Small Language Models},
  booktitle = {LREC},
  year      = {2026},
  url       = {https://arxiv.org/abs/2604.01965},
  pdf       = {publications/Task-Aware_Retrieval_NSLP-LREC2026.pdf},
  keywords  = {retrieval-augmented generation, scholarly QA, small language models, task-aware retrieval},
  abstract  = {Scientific knowledge discovery increasingly relies on large language models, yet many existing scholarly assistants depend on proprietary systems with tens or hundreds of billions of parameters. We investigate whether carefully designed retrieval pipelines can compensate for reduced model scale. We propose a lightweight retrieval-augmented framework with task-aware routing that selects specialized retrieval strategies based on the query, integrates full-text papers and structured metadata, and uses compact instruction-tuned models for citation-grounded responses. Evaluations across scholarly QA, multi-document reasoning, biomedical QA under domain shift, and text compression show that retrieval and model scale are complementary: retrieval design can partially offset smaller models, but model capacity remains crucial for complex reasoning.},
  tldr      = {We show that task-aware retrieval pipelines combined with small instruction-tuned models can achieve strong performance on scholarly tasks, partially compensating for reduced model size. However, model scale remains important for complex reasoning, highlighting a complementary relationship between retrieval design and model capacity.},
  plain     = {Do we really need huge AI models for science? Not always. With smart retrieval of relevant papers, smaller models can already perform well—but very complex reasoning still benefits from larger models.},
}
@inproceedings{besrour2026unarxive,
  author    = {Ines Besrour and Michael F{\"a}rber},
  title     = {{unarXive} 2024: A Large-Scale Scientific Corpus for Citation-Aware Retrieval and Generation},
  booktitle = {LREC},
  year      = {2026},
  abstract  = {Full-text collections of scientific papers are essential for NLP research and the training of language models. However, existing resources remain incomplete: they often lag behind the fast-paced growth of scientific publishing, lack comprehensive citation networks, and discard essential structural elements. In this work, we introduce unarXive 2024, a large-scale, richly structured corpus containing every arXiv submission from January 1991 to December 2024 – over 2.28 million documents across physics, mathematics, computer science, and other fields. Our release enhances each paper with detailed metadata, reconstructs a substantially more complete citation network than existing datasets, and preserves fine-grained structural information, including section boundaries, mathematical notation, and non-textual elements. Beyond the corpus itself, we provide dense and sparse indexes optimized for retrieval-augmented generation (RAG) over the full arXiv archive. All resources, including code and data, are publicly available: https://github.com/faerber-lab/unarXive-2024},
  url       = {https://github.com/faerber-lab/unarXive-2024},
  pdf       = {publications/unarXive_LREC2026.pdf},
  tldr      = {We introduce unarXive 2024, a 2.28M-paper corpus with improved citation resolution (39.5M links) and structured full text, enabling large-scale citation-aware retrieval and RAG.},
  plain     = {We collected millions of scientific papers and organized them so computers can better understand what they say and how they reference each other.}
}
@inproceedings{schopf2026idea,
  author    = {Tim Schopf and Michael F{\"a}rber},
  title     = {Is this Idea Novel? An Automated Benchmark for Judgment of Research Ideas},
  booktitle = {LREC},
  year      = {2026},
  url       = {https://github.com/TimSchopf/RINoBench},
  pdf       = {publications/IdeaNovel_LREC2026.pdf},
  abstract  = {Judging the novelty of research ideas is crucial for advancing science, enabling the identification of unexplored directions, and ensuring contributions meaningfully extend existing knowledge rather than reiterate minor variations. However, given the exponential growth of scientific literature, manually judging the novelty of research ideas through literature reviews is labor-intensive, subjective, and infeasible at scale. Therefore, recent efforts have proposed automated approaches for research idea novelty judgment. Yet, evaluation of these approaches remains largely inconsistent and is typically based on non-standardized human evaluations, hindering large-scale, comparable evaluations. To address this, we introduce RINoBench, the first comprehensive benchmark for large-scale evaluation of research idea novelty judgments. It comprises 1,381 research ideas derived from and judged by human experts as well as nine automated evaluation metrics designed to assess both rubric-based novelty scores and textual justifications of novelty judgments. Using this benchmark, we evaluate several state-of-the-art large language models (LLMs) on their ability to judge the novelty of research ideas. Our findings reveal that while LLM-generated reasoning closely mirrors human rationales, this alignment does not reliably translate into accurate novelty judgments, which diverge significantly from human gold standard judgments—even among leading reasoning-capable models.},
  tldr      = {We introduce RINoBench, a large-scale benchmark for evaluating how well language models judge the novelty of research ideas, showing that current models produce plausible reasoning but fail to make accurate novelty judgments.},
  plain     = {We built a dataset to test whether AI can judge if a research idea is new. Current models can explain their reasoning well, but they often still get the final decision wrong.},
}
@inproceedings{yuan2026codae,
  author     = {Shuzhou Yuan and William LaCroix and Hardik Ghoshal and Ercong Nie and Michael F{\"a}rber},
  title      = {{CoDAE}: Adapting Large Language Models for Education via Chain-of-Thought Data Augmentation},
  booktitle  = {LREC},
  year       = {2026},
  url        = {https://arxiv.org/abs/2508.08386},
  doi        = {10.48550/arXiv.2508.08386},
  eprinttype = {arXiv},
  eprint     = {2508.08386},
  abstract   = {Large Language Models (LLMs) are increasingly used as AI tutors, but often fail to provide pedagogically appropriate guidance. They may reveal answers too quickly, struggle to adapt to student uncertainty, and remain vulnerable to manipulative prompts. We introduce CoDAE, a framework that adapts LLMs for educational use via Chain-of-Thought (CoT) data augmentation. Using real-world student–tutor dialogues, we enrich training data with step-by-step reasoning and targeted dialogue cases addressing over-compliance, low adaptivity, and threat vulnerability. Fine-tuning multiple open-source LLMs on these datasets leads to improved pedagogical behavior, stronger reasoning support, and increased robustness against premature answer disclosure.},
  tldr       = {We propose CoDAE, a CoT-based data augmentation framework to adapt LLMs for educational settings. Fine-tuned models provide more pedagogically aligned guidance, better reasoning support, and improved robustness against over-compliance and manipulation.},
  plain      = {AI tutors often give answers too quickly or can be manipulated. We improve them by training on better example dialogues, helping them guide students step by step instead of just giving solutions.},
  keywords   = {educational NLP, chain-of-thought, data augmentation, LLM fine-tuning},
  pdf        = {publications/CoDAE_LREC2026.pdf}
}
@inproceedings{thellmann2026eu20,
  author    = {Klaudia Thellmann and Bernhard Stadler and Michael F{\"a}rber},
  title     = {Diagnosing Translated Benchmarks: An Automated Quality Assurance Study of the {EU20} Benchmark Suite},
  booktitle = {LREC},
  year      = {2026},
  url       = {https://arxiv.org/abs/2604.01957},
  keywords  = {machine translation evaluation, benchmark quality, multilingual NLP},
  abstract  = {Machine-translated benchmark datasets reduce costs and offer scale, but noise, loss of structure, and uneven quality weaken confidence. What matters is not merely whether we can translate, but also whether we can measure and verify translation reliability at scale. We study translation quality in the EU20 benchmark suite, which comprises five established benchmarks translated into 20 languages, via a three-step automated quality assurance approach: (i) a structural corpus audit with targeted fixes; (ii) quality profiling using a neural metric (COMET, reference-free and reference-based) with translation service comparisons (DeepL / ChatGPT / Google); and (iii) an LLM-based span-level translation error landscape. Trends are consistent: datasets with lower COMET scores exhibit a higher share of accuracy/mistranslation errors at span level (notably HellaSwag; ARC is comparatively clean). Reference-based COMET on MMLU against human-edited samples points in the same direction. We release cleaned/corrected versions of the EU20 datasets, and code for reproducibility. In sum, automated quality assurance offers practical, scalable indicators that help prioritize review -- complementing, not replacing, human gold standards.},
  tldr      = {We analyze the multilingual EU20 benchmark suite with automated quality assurance methods combining structural checks, COMET-based scoring, and LLM-based error analysis. The results show that automated metrics can reliably surface problematic translated datasets and help prioritize manual review at scale.},
  plain     = {Many AI benchmarks are translated automatically into many languages, but some translations contain serious errors. We show how automatic checks can find these weak spots efficiently and help improve benchmark quality before researchers rely on them.},
  pdf       = {publications/EU20_Benchmarks_LREC2026.pdf}
}
@inproceedings{kummer2026prompt,
  author    = {Cornelius Kummer and Lena Jurkschat and Michael F{\"a}rber and Sahar Vahdati},
  title     = {Prompt Compression in the Wild: Measuring Latency, Rate Adherence, and Quality for Faster {LLM} Inference},
  booktitle = {ECIR},
  year      = {2026},
  location  = {Delft, The Netherlands},
  doi       = {10.1007/978-3-032-21289-4_17},
  url       = {https://doi.org/10.1007/978-3-032-21289-4_17},
  pdf       = {publications/PromptCompression_ECIR2026.pdf},
  abstract  = {With the widespread adoption of language models in information retrieval, especially in retrieval-augmented generation (RAG) systems, inference latency has become a key bottleneck, as retrieved passages substantially increase prompt length and computational cost. Prompt compression reduces input size while aiming to preserve downstream task performance, but its practical value depends on whether the added preprocessing time is compensated by faster decoding. We present the first systematic large-scale study of this trade-off, based on thousands of runs and 30,000 queries across several open-source LLMs and three GPU classes. Our evaluation disentangles compression overhead from decoding latency while also measuring output quality and memory usage. LLMLingua yields up to 18\% end-to-end speedups when prompt length, compression ratio, and hardware capacity are well aligned, while response quality remains statistically unchanged across summarization, code generation, and question answering tasks. Outside this operating regime, however, compression overhead dominates and offsets the gains. We further show that effective compression can reduce memory usage enough to move workloads from data-center GPUs to commodity graphics cards, with only a 0.3\,s increase in latency. Finally, we release an open-source profiler that predicts the latency break-even point for each model-hardware setup and provides practical guidance on when prompt compression is beneficial in real-world deployments.},
  keywords  = {prompt compression, large language models, retrieval-augmented generation, inference latency},
  tldr      = {We present the first large-scale empirical study of prompt compression under realistic inference conditions, showing that LLMLingua yields up to 18\% end-to-end latency reduction without statistically significant quality degradation when prompt length, compression ratio, and hardware are properly aligned, while misaligned settings let compression overhead dominate.},
  plain     = {Prompt compression can make language models faster and cheaper, but only in the right setting. We show when it helps in practice and when the compression step takes longer than the time it saves.}
}
@article{doi:10.1021/acscentsci.5c02031,
  author   = {Ahlbrecht, Jesse and Lutz, Marius D. R. and Jost, Vera and F{\"a}rber, Michael and Br{\"a}se, Stefan and Wuitschik, Georg},
  title    = {Which Reaction Conditions Work on Drug-Like Molecules? Lessons from 66,000 High-Throughput Experiments},
  journal  = {ACS Central Science},
  volume   = {12},
  number   = {2},
  pages    = {222--232},
  year     = {2026},
  doi      = {10.1021/acscentsci.5c02031},
  url      = {https://doi.org/10.1021/acscentsci.5c02031},
  abstract = {High-throughput experimentation (HTE) accelerates chemical discovery by shortening the lead times for molecule synthesis. The choice of initial reaction conditions directly influences the outcome and length of any reaction optimization. But human involvement in plate design and data analysis remains a significant cost factor and is accompanied by biases. Therefore, making the most out of past reaction outcomes is crucial. While advances in machine learning allow us to generate promising reaction conditions, this approach is often not suitable because not enough relevant reaction data are available or it is of insufficient quality. Herein we introduce a robust statistical method using z-scores to analyze 66,000 internal HTE reactions on complex molecules. Additionally, we publish the underlying data as well as a tool to analyze and draw actionable conclusions from this data set. We exemplify the power of this method for the widely employed Buchwald--Hartwig and Suzuki--Miyaura cross-coupling reactions. The results reveal optimal conditions that differ significantly from literature-based guidelines. These data-driven insights provide high-quality starting points for optimization campaigns, improving their overall efficiency.},
  keywords = {high-throughput experimentation, reaction conditions, reaction optimization, z-scores, statistical analysis, Buchwald--Hartwig coupling, Suzuki--Miyaura coupling, drug-like molecules, chemical synthesis},
  tldr     = {We analyze 66,000 high-throughput reactions on complex, drug-like molecules using a robust z-score-based statistical method, and release both the dataset and an analysis tool, revealing data-driven optimal conditions for Buchwald--Hartwig and Suzuki--Miyaura couplings that differ markedly from literature guidelines.},
  plain    = {We study outcomes from 66,000 automated chemistry experiments and use a simple, robust statistic (z-scores) to identify which starting reaction settings work best. The released dataset and tool show that the best conditions for common cross-coupling reactions can differ a lot from what textbooks and papers typically recommend, giving better starting points for faster optimization.}
}
@article{DBLP:journals/corr/abs-2601-08668,
  author     = {Kyuri Im and Shuzhou Yuan and Michael F{\"{a}}rber},
  title      = {Analyzing Bias in False Refusal Behavior of Large Language Models for Hate Speech Detoxification},
  journal    = {CoRR},
  volume     = {abs/2601.08668},
  year       = {2026},
  url        = {https://doi.org/10.48550/arXiv.2601.08668},
  eprinttype = {arXiv},
  eprint     = {2601.08668},
  abstract   = {We study false refusal behavior of large language models in hate speech detoxification. Across nine models and English/multilingual datasets, refusals increase with semantic toxicity and for targets such as nationality, religion, and political ideology, with additional language-dependent biases. We propose a simple cross-translation strategy (English→Chinese→English) that significantly reduces false refusals while preserving content.}
}
@article{DBLP:journals/corr/abs-2602-00279,
  author     = {Philip M{\"{u}}ller and Nicholas Popovic and Michael F{\"{a}}rber and Peter Steinbach},
  title      = {Benchmarking Uncertainty Calibration in Large Language Model Long-Form Question Answering},
  journal    = {CoRR},
  volume     = {abs/2602.00279},
  year       = {2026},
  url        = {https://doi.org/10.48550/arXiv.2602.00279},
  eprinttype = {arXiv},
  eprint     = {2602.00279},
  abstract   = {Large language models (LLMs) are increasingly used for question answering in the natural sciences and beyond, making reliable uncertainty quantification (UQ) essential for trustworthy deployment. However, existing UQ approaches remain only weakly validated in scientific QA, where factual retrieval and reasoning are both critical. We introduce the first large-scale benchmark for evaluating UQ metrics in reasoning-intensive QA and provide an extensible open-source framework for reproducible calibration assessment. Our study covers up to 20 LLMs, including base, instruction-tuned, and reasoning-oriented variants, and evaluates them on seven scientific QA datasets spanning multiple-choice and arithmetic tasks, formulated through prompting as open QA. In total, we analyze 685,000 long-form responses across varying levels of reasoning complexity representative of domain-specific settings. At the token level, we find that instruction tuning strongly polarizes probability mass, reducing the reliability of token-level confidences as uncertainty estimates. Models additionally fine-tuned for reasoning exhibit the same pattern, although the reasoning process can partially mitigate this effect depending on the provider. At the sequence level, verbalized confidence methods are systematically biased and only weakly correlated with correctness, whereas answer frequency across samples provides the most reliable calibration. We further show that relying exclusively on expected calibration error (ECE) can give a misleading picture of UQ performance on benchmark datasets. Overall, our findings reveal substantial limitations of current UQ methods for LLMs and of standard benchmarking practices in scientific QA.}
}
@article{DBLP:journals/corr/abs-2602-12828,
  author     = {Zhan Qu and Michael F{\"{a}}rber},
  title      = {{GRAIL:} Geometry-Aware Retrieval-Augmented Inference with {LLMs} over Hyperbolic Representations of Patient Trajectories},
  journal    = {CoRR},
  volume     = {abs/2602.12828},
  year       = {2026},
  url        = {https://doi.org/10.48550/arXiv.2602.12828},
  eprinttype = {arXiv},
  eprint     = {2602.12828},
  abstract   = {Predicting future clinical events from longitudinal electronic health records (EHRs) is challenging due to sparse multi-type observations, hierarchical medical vocabularies, and the tendency of large language models (LLMs) to hallucinate when reasoning over long structured histories. We study next-visit event prediction, which aims to forecast a patient's upcoming clinical events from prior visits. We propose GRAIL, a framework that represents longitudinal EHRs through structured geometric modeling and structure-aware retrieval. GRAIL builds a unified clinical graph by combining deterministic coding-system hierarchies with data-driven temporal associations across event types, embeds this graph in hyperbolic space, and summarizes each visit as a probabilistic Central Event to denoise sparse observations. At inference time, it retrieves a structured set of clinically plausible future events that respect hierarchical and temporal progression, and can optionally refine their ranking using an LLM as a constrained reranker. Experiments on MIMIC-IV show that GRAIL consistently improves multi-type next-visit prediction and produces forecasts that are more consistent with clinical hierarchies.}
}
@article{DBLP:journals/corr/abs-2602-12833,
  author     = {Zhan Qu and Michael F{\"{a}}rber},
  title      = {{TRACE:} Temporal Reasoning via Agentic Context Evolution for Streaming Electronic Health Records ({EHRs})},
  journal    = {CoRR},
  volume     = {abs/2602.12833},
  year       = {2026},
  url        = {https://doi.org/10.48550/arXiv.2602.12833},
  eprinttype = {arXiv},
  eprint     = {2602.12833},
  abstract   = {Large language models (LLMs) encode extensive medical knowledge but struggle to apply it reliably to longitudinal patient trajectories, where evolving clinical states, irregular timing, and heterogeneous events degrade performance over time. Existing adaptation strategies typically rely on fine-tuning or retrieval-based augmentation, which introduce computational overhead, privacy constraints, or instability under long contexts. We introduce TRACE (Temporal Reasoning via Agentic Context Evolution), a framework for temporal clinical reasoning with frozen LLMs that explicitly structures and maintains context rather than extending context windows or updating model parameters. TRACE operates through a dual-memory architecture comprising a static Global Protocol that captures institutional clinical rules and a dynamic Individual Protocol that tracks patient-specific state. Four agentic components---Router, Reasoner, Auditor, and Steward---coordinate over this structured memory to support temporal inference and state evolution. The framework maintains bounded inference cost through structured state compression and selectively audits safety-critical clinical decisions. Evaluated on longitudinal clinical event streams from MIMIC-IV, TRACE substantially improves next-event prediction accuracy, protocol adherence, and clinical safety over long-context and retrieval-augmented baselines, while producing interpretable and auditable reasoning traces.}
}
@article{DBLP:journals/corr/abs-2602-12869,
  author     = {Zhan Qu and Michael F{\"{a}}rber},
  title      = {{X-VORTEX:} Spatio-Temporal Contrastive Learning for Wake Vortex Trajectory Forecasting},
  journal    = {CoRR},
  volume     = {abs/2602.12869},
  year       = {2026},
  url        = {https://doi.org/10.48550/arXiv.2602.12869},
  eprinttype = {arXiv},
  eprint     = {2602.12869},
  abstract   = {Wake vortices are strong, coherent air turbulences generated by aircraft and remain a major safety and capacity challenge in air traffic management. Tracking their movement, weakening, and dissipation over time from LiDAR measurements is difficult because scans are sparse, vortex signatures fade as the flow breaks down under atmospheric turbulence and instabilities, and point-wise annotation is prohibitively expensive. Existing approaches largely treat each scan as an independent, fully supervised segmentation problem, thereby neglecting temporal structure and scaling poorly to the large unlabeled archives available in practice. We present X-VORTEX, a spatio-temporal contrastive learning framework grounded in Augmentation Overlap Theory that learns physics-aware representations from unlabeled LiDAR point cloud sequences. X-VORTEX addresses two central challenges, sensor sparsity and time-varying vortex dynamics, by constructing paired inputs from the same underlying flight event: a weakly perturbed sequence and a strongly augmented counterpart created through temporal subsampling and spatial masking. This training strategy encourages the model to align representations across missing frames and partial observations. Architecturally, a time-distributed geometric encoder extracts per-scan features, while a sequential aggregator models the evolving vortex state across variable-length sequences. Experiments on a real-world dataset containing more than one million LiDAR scans show that X-VORTEX achieves superior vortex center localization while requiring only 1\% of the labeled data needed by supervised baselines, and that its learned representations also support accurate trajectory forecasting.}
}
@inproceedings{HendlFaerber2025QuantumKG,
  author    = {Jonas Hendl and Michael F{\"a}rber},
  title     = {Incorporating Content-based Features into Quantum Knowledge Graph Embeddings},
  booktitle = {AIQxQIA@ECAI},
  year      = {2025},
  url       = {https://ceur-ws.org/Vol-4153/paper2.pdf},
  pdf       = {publications/LinkPred_AIQxQIA_ECAI2025.pdf},
  keywords  = {quantum machine learning, link prediction, knowledge graphs, multimodal learning, text embeddings},
  abstract  = {Link prediction in relational data structures, such as knowledge graphs, plays a crucial role in maintaining up-to-date and accurate information. While classical approaches typically leverage either the graph's connectivity or associated textual descriptions (e.g., labels, definitions), recent quantum models have focused predominantly on structural aspects, leaving the potential of textual information largely unexplored. This paper presents the first quantum link prediction framework that combines both structural and textual modalities. We first generate classical text embeddings, apply dimensionality reduction, and encode them into quantum circuits using two complementary strategies: an Amplitude Encoding Model for high-dimensional fidelity and an Angle Encoding Model optimized for gate efficiency. Experimental results on standard benchmark datasets demonstrate that incorporating textual features in quantum architectures is not only feasible but also enhances the predictive performance of quantum link prediction models.},
  tldr      = {We introduce the first quantum link prediction framework that fuses knowledge-graph structure with text features, using amplitude and angle encoding to inject reduced text embeddings into quantum circuits and improve prediction accuracy.},
  plain     = {We teach a quantum model to “read the labels” in a knowledge graph—not just follow the links. By turning short texts into simple signals for a quantum circuit, it predicts missing connections more accurately.},
}
@inproceedings{loehr2026hiddenbias,
  author    = {L{\"o}hr, Konrad and Yuan, Shuzhou and F{\"a}rber, Michael},
  title     = {The Hidden Bias: A Study on Explicit and Implicit Political Stereotypes in Large Language Models},
  booktitle = {EACL},
  year      = {2026},
  url       = {https://arxiv.org/pdf/2510.08236},
  pdf       = {publications/Political_Bias_and_Stereotypes_in_LLM_EACL2026.pdf},
  keywords  = {LLMs, bias, politics, stereotypes},
  abstract  = {Large Language Models (LLMs) are increasingly integral to information dissemination and decision-making processes. Given their growing societal influence, understanding potential biases, particularly within the political domain, is crucial to prevent undue influence on public opinion and democratic processes. This work investigates political bias and stereotype propagation across eight prominent LLMs using the two-dimensional Political Compass Test (PCT). Initially, the PCT is employed to assess the inherent political leanings of these models. Subsequently, persona prompting with the PCT is used to explore explicit stereotypes across various social dimensions. In a final step, implicit stereotypes are uncovered by evaluating models with multilingual versions of the PCT. Key findings reveal a consistent left-leaning political alignment across all investigated models. Furthermore, while the nature and extent of stereotypes vary considerably between models, implicit stereotypes elicited through language variation are more pronounced than those identified via explicit persona prompting. Interestingly, for most models, implicit and explicit stereotypes show a notable alignment, suggesting a degree of transparency or "awareness" regarding their inherent biases. This study underscores the complex interplay of political bias and stereotypes in LLMs.},
  tldr      = {We measure political leanings and stereotypes in major LLMs and show that language variation can reveal stronger implicit bias than persona prompts.},
  plain     = {We run chatbots through a “political compass” quiz and then take a close look at the stereotypes they repeat, especially when we ask the same things in another language.},
}
@inproceedings{Qu2026PoeTone,
author = {Zhan Qu and Shuzhou Yuan and Michael F{\"a}rber},
title = {PoeTone: A Framework for Constrained Generation of Structured Chinese Songci with LLMs},
abstract = {This paper presents a systematic investigation into the constrained generation capabilities of large language models (LLMs) in producing Songci, a classical Chinese poetry form characterized by strict structural, tonal, and rhyme constraints defined by Cipai templates. We first develop a comprehensive, multi-faceted evaluation framework that includes: (i) a formal conformity score, (ii) automated quality assessment using LLMs, (iii) human evaluation, and (iv) classification-based probing tasks. Using this framework, we evaluate the generative performance of 18 LLMs, including 3 proprietary models and 15 open-source models across four families, under five prompting strategies: zero-shot, one-shot, completion-based, instruction-tuned, and chain-of-thought. Finally, we propose a Generate-Critic architecture in which the evaluation framework functions as an automated critic. Leveraging the critic's feedback as a reward signal, we fine-tune three lightweight open-source LLMs via supervised fine-tuning (SFT), resulting in improvements of up to 5.88% in formal conformity. Our findings offer new insights into the generative strengths and limitations of LLMs in producing culturally significant and formally constrained literary texts.},
booktitle = {AAAI},
year = {2026},
location = {Singapore},
url = {https://doi.org/10.48550/arXiv.2508.02515},
pdf = {publications/PoeTone_AAAI2026.pdf},
keywords = {LLMs, constrained generation, Chinese poetry, Songci},
tldr = {We build a framework for generating classical Chinese Songci that satisfies strict tone and rhyme constraints.},
plain = {Songci poetry has rules like sheet music, and we teach an AI to write it while staying on the right tones and rhymes, with a strict checker that keeps it honest.},
selected = {true}
}
@inproceedings{Yuan2025Psycholinguistics,
title = {From Monolingual to Bilingual: Investigating Language Conditioning in Large Language Models for Psycholinguistic Tasks},
author = {Yuan, Shuzhou and Qu, Zhan and Tawfelis, Mario and F{\"a}rber, Michael},
booktitle = {AACL},
year = {2025},
location = {Mumbai, India},
abstract = {Large Language Models (LLMs) exhibit strong linguistic capabilities, but little is known about how they encode psycholinguistic knowledge across languages. We investigate whether and how LLMs exhibit human-like psycholinguistic responses under different linguistic identities using two tasks: sound symbolism and word valence. We evaluate two models, Llama-3.3-70B-Instruct and Qwen2.5-72B-Instruct, under monolingual and bilingual prompting in English, Dutch, and Chinese. Behaviorally, both models adjust their outputs based on prompted language identity, with Qwen showing greater sensitivity and sharper distinctions between Dutch and Chinese. Probing analysis reveals that psycholinguistic signals become more decodable in deeper layers, with Chinese prompts yielding stronger and more stable valence representations than Dutch. Our results demonstrate that language identity conditions both output behavior and internal representations in LLMs, providing new insights into their application as models of cross-linguistic cognition.},
pdf = {publications/Psycholinguistics_AACL2025.pdf},
tldr = {We show that prompting an LLM with a specific language identity changes its behavior on psycholinguistic tasks and shifts what its internal layers encode across languages.},
plain = {We show that an AI can “put on” another language like a costume and then answers differently, and even its inner signals shift when it is prompted in a different language.}
}
@inproceedings{yuan2025llmintheloop,
title = {LLM in the Loop: Creating the ParaDeHate Dataset for Hate Speech Detoxification},
author = {Yuan, Shuzhou and Nie, Ercong and Kouba, Lukas and Kangen, Ashish Yashwanth and Schmid, Helmut and Sch{\"u}tze, Hinrich and F{\"a}rber, Michael},
booktitle = {AACL},
year = {2025},
url = {https://arxiv.org/pdf/2506.01484},
abstract = {Detoxification, the task of rewriting harmful language into non-toxic text, has become increasingly important amid the growing prevalence of toxic content online. However, high-quality parallel datasets for detoxification, especially for hate speech, remain scarce due to the cost and sensitivity of human annotation. In this paper, we propose a novel LLM-in-the-loop pipeline leveraging GPT-4o-mini for automated detoxification. We first replicate the ParaDetox pipeline by replacing human annotators with an LLM and show that the LLM performs comparably to human annotation. Building on this, we construct ParaDeHate, a large-scale parallel dataset specifically for hate-speech detoxification. We release ParaDeHate as a benchmark of over 8K hate/non-hate text pairs and evaluate a wide range of baseline methods. Experimental results show that models such as BART, fine-tuned on ParaDeHate, achieve better performance in style accuracy, content preservation, and fluency, demonstrating the effectiveness of LLM-generated detoxification text as a scalable alternative to human annotation.},
tldr = {We create ParaDeHate, an LLM-in-the-loop dataset for hate-speech detoxification, enabling systematic evaluation of meaning-preserving toxic-text rewriting.},
plain = {We build a large training set that rewrites toxic text into cleaner language without changing the meaning, like running dirty water through a filter and checking the result.}
}
@inproceedings{nie2025decomposed,
title = {Decomposed Prompting: Probing Multilingual Linguistic Structure Knowledge in Large Language Models},
author = {Nie, Ercong and Yuan, Shuzhou and Ma, Bolei and Schmid, Helmut and F{\"a}rber, Michael and Kreuter, Frauke and Sch{\"u}tze, Hinrich},
booktitle = {AACL},
year = {2025},
url = {https://arxiv.org/pdf/2402.18397},
abstract = {Probing the multilingual knowledge of linguistic structure in large language models (LLMs), often characterized as sequence labeling, faces challenges with maintaining output templates in current text-to-text prompting strategies. To address this, we introduce a decomposed prompting approach for sequence labeling tasks. Instead of relying on a single text-to-text prompt, our method generates an individual prompt for each token in the input sentence, querying its linguistic label. We evaluate our approach on Universal Dependencies part-of-speech tagging across 38 languages, using both English-centric and multilingual LLMs. Our results show that decomposed prompting outperforms iterative prompting baselines in both effectiveness and efficiency under zero- and few-shot settings. Moreover, our analysis of multilingual performance in English-centric LLMs provides insights into the transferability of linguistic knowledge via multilingual prompting.},
tldr = {We introduce token-wise decomposed prompting for multilingual sequence labeling, improving accuracy and efficiency for POS tagging across 38 languages.},
plain = {Instead of asking an AI to label a whole sentence in one go, we ask about each word step by step, like a teacher calling roll, and this makes the labels more reliable across languages.}
}
@inproceedings{salsabil2025contextbased,
title = {Context-Based URL Classification for Open Access Datasets and Software in Scholarly Documents},
author = {Salsabil, Lamia and Obadage, Rochana R. and Banerjee, Bipasha and Abeysinghe, Yasasi Achinthya and Alam, Sawood and F{\"a}rber, Michael and Ingram, William and Fox, Edward and Wu, Jian},
booktitle = {JCDL},
year = {2025},
abstract = {This study presents a novel framework for automatically classifying open-access datasets and software (OADS) URLs in scholarly documents. Accurate classification of OADS-URLs is the first step in investigating the availability and preservability of OADS, a crucial step toward open science and computational reproducibility. Our framework, EnSU, leverages an ensemble-based approach to classify OADS-URLs by their citation contexts. The ensemble integrates three models: a Supervised Contrastive Learning model, a SciBERT-based model, and a BertGCN model. Our framework distinguishes the resource types (dataset vs. software) and providers (author vs. third-party). To train and evaluate EnSU, we compiled a dataset, OADS-1K, comprising 1,129 manually annotated sentences containing URLs along with their expanded contexts. Our model outperforms all baseline classifiers, including a large language model-based approach, with the best F1-score of 90%. The dataset and source code are publicly available at: https://github.com/lamps-lab/EnSU/tree/main.},
tldr = {We classify URLs in scholarly documents to identify open datasets and software links, enabling large-scale tracking of research artifacts and openness.},
plain = {We build a “link bouncer” for research papers that checks whether a URL really leads to open data or usable software, so open resources are easier to find and count.}
}
@inproceedings{PolitovSJ025,
author = {Andrei Politov and
Oleh Shkalikov and
Ren{\'{e}} J{\"{a}}kel and
Michael F{\"{a}}rber},
title = {Revisiting Projection-based Data Transfer for Cross-Lingual Named Entity Recognition in Low-Resource Languages},
booktitle = {NoDaLiDa},
location = {Tallinn, Estonia},
pages = {499--507},
year = {2025},
url = {https://aclanthology.org/2025.nodalida-1.54/},
abstract = {Cross-lingual Named Entity Recognition (NER) exploits knowledge transfer across languages to identify and classify named entities, making it particularly valuable for low-resource languages. In this paper, we show that data-based cross-lingual transfer is an effective strategy for cross-lingual NER and can outperform multilingual language models in low-resource settings. We introduce two key enhancements to the annotation projection step. First, we refine word alignments by leveraging back-translation to improve projection accuracy. Second, we propose a novel, formalized projection method that matches source-language entities with candidate entities extracted in the target language. Through extensive experiments on two datasets covering 57 languages, we demonstrate that our approach consistently outperforms existing projection-based methods for low-resource languages. These results highlight the robustness and effectiveness of projection-based data transfer as a strong alternative to model-based approaches for cross-lingual named entity recognition in low-resource scenarios.},
pdf = {publications/XNER_NoDaLiDA2025.pdf},
tldr = {We improve projection-based cross-lingual NER using back-translation and a formal matching step, outperforming baselines across 57 low-resource languages.},
plain = {We improve name detection in languages where training material is scarce by transferring labels from other languages and checking again via translation, like carbon paper plus a second copy to catch errors.}
}
@inproceedings{Claim2source,
title = {Claim2Source at CheckThat! 2025: Zero-Shot Style Transfer for Scientific Claim-Source Retrieval},
author = {Schreieder, Tobias and F{\"a}rber, Michael},
abstract = {In this paper, we present our participation in the CheckThat!~2025 Task~4b on scientific claim--source retrieval. We systematically investigate the impact of style transfer on retrieving the scientific publication referenced by COVID-19-related tweets. To this end, we apply seven distinct style transfer methods to both claims and sources and evaluate their effects on retrieval performance. The style-transferred variants are assessed across 15 retrieval systems, comprising one sparse, seven dense, and seven hybrid models, by testing all combinations of claim and source styles. The style transfer process is guided by a modular zero-shot prompting template with detailed instructions using a large language model. Our results show that GritLM-7B achieves the best performance without style transfer, indicating strong robustness to informal text. In contrast, most retrieval models, particularly sparse and hybrid approaches, benefit from transforming claims into a more formal writing style. Furthermore, we find that hybrid retrieval models generally outperform dense-only models, highlighting the advantages of combining sparse and dense retrieval paradigms for scientific claim--source retrieval.},
booktitle = {CLEF},
year = {2025},
pages = {1203--1216},
location = {Madrid, Spain},
url = {https://ceur-ws.org/Vol-4038/paper_94.pdf},
pdf = {publications/Claim2Source_CLEF2025.pdf},
tldr = {We test whether rewriting informal scientific claims in a more formal style improves claim-to-paper retrieval, and analyze when hybrid retrieval helps most.},
plain = {We show that rewriting informal claims into a cleaner, paper-like wording helps find the right scientific source, like using the correct catalog terms when searching a library.}
}
@article{Adib:25,
author = {Md Mosaddek Hossain Adib and Patrick Matalla and Christoph F{\"u}llner and Shi Li and Elias Giacoumidis and Christian Raack and Ulrich Menne and Michael Straub and Tarek Saier and Christoph Schweikert and Stefan Orf and Martin Gontscharow and Tobias K{\"a}fer and Michael F{\"a}rber and Andr{\'e} Richter and Ren{\'e} Bonk and Sebastian Randel},
journal = {J. Opt. Commun. Netw.},
keywords = {Heterodyne detection; Network topology; Networking hardware; Neural networks; Optical networks; Semiconductor optical amplifiers},
number = {3},
pages = {221--232},
publisher = {Optica Publishing Group},
title = {Optical-access networks for smart sustainable cities: from network architecture to fiber deployment},
volume = {17},
month = {Mar},
year = {2025},
url = {https://opg.optica.org/jocn/abstract.cfm?URI=jocn-17-3-221},
doi = {10.1364/JOCN.542368},
abstract = {With the steadily progressing digitization of our society and the migration into urban areas, digitized and highly connected smart cities have attracted much attention from the research community due to their impact on everyday life, potential for new innovations, and ability to reduce carbon footprints. The versatile applications, which are intended to improve life in cities in various aspects, have one thing in common---they rely on widespread, reliable, and high-performing communication networks. Therefore, optical-access networks will be a crucial part of the smart cities' network infrastructure as they provide cost-effective and high-speed connectivity to antenna sites, residents, enterprises, businesses, and regional data centers in a point-to-multipoint topology. In this article, we address the overall impact of this urban transformation on such networks. We outline our vision of the future smart sustainable city, which will leverage advanced optical-access networks. Subsequently, the physical layer design of optical-access networks is analyzed in the context of point-to-multipoint network topology. This includes a 100-Gbit/s intensity-modulation and direct-detection passive optical network (PON) and a 200-Gbit/s coherent PON utilizing eight-digital subcarrier-based time- and wavelength-division multiplexing and coherent detection. We discuss artificial intelligence-based network monitoring and resource allocation. Next, we provide a techno-economical study for sustainable fiber deployment strategies. Finally, we report the results of a network demonstration for the remote assistance of a connected autonomous vehicle.},
pdf = {publications/SmartCity_JOCN2025.pdf},
tldr = {We outline optical access-network designs for smart cities and discuss AI-based monitoring and cost-aware fiber deployment for sustainable connectivity.},
plain = {This work sketches the “roads and water pipes” of smart cities, explaining how fiber networks and AI monitoring can keep future connectivity fast, stable, and energy-aware.}
}
@inproceedings{Besrour2025SQuAI,
author = {Ines Besrour and Jingbo He and Tobias Schreieder and Michael F{\"a}rber},
title = {SQuAI: Scientific Question-Answering with Multi-Agent Retrieval-Augmented Generation},
booktitle = {CIKM},
location = {Seoul, South Korea},
year = {2025},
url = {https://dl.acm.org/doi/pdf/10.1145/3746252.3761471},
abstract = {We present SQuAI, a scalable and trustworthy multi-agent retrieval-augmented generation (RAG) framework for scientific question answering (QA) with large language models (LLMs). SQuAI addresses key limitations of existing RAG systems in the scholarly domain, where complex, open-domain questions demand accurate answers, explicit claims with citations, and retrieval across millions of scientific documents. Built on over 2.3 million full-text papers from arXiv.org, SQuAI employs four collaborative agents to decompose complex questions into sub-questions, retrieve targeted evidence via hybrid sparse-dense retrieval, and adaptively filter documents to improve contextual relevance. To ensure faithfulness and traceability, SQuAI integrates in-line citations for each generated claim and provides supporting sentences from the source documents. Our system improves faithfulness, answer relevance, and contextual relevance by up to +0.088 (12\%) over a strong RAG baseline. We further release a benchmark of 1,000 scientific question-answer-evidence triplets to support reproducibility. With transparent reasoning, verifiable citations, and domain-wide scalability, SQuAI demonstrates how multi-agent RAG enables more trustworthy scientific QA with LLMs.},
pdf = {publications/SQuAI_CIKM2025.pdf},
keywords = {scientific QA, RAG, multi-agent systems, citations, LLMs},
tldr = {SQuAI answers scientific questions via multi-agent retrieval over millions of papers and produces claim-level citations with supporting evidence sentences.},
plain = {SQuAI works like a team of librarians by splitting a hard question into smaller ones, pulling evidence from millions of papers, and answering with sources that readers can check.}
}
@inproceedings{Shao2025RealE,
author = {Chen Shao and Yue Wang and Zhenyi Zhu and Zhanbo Huang and Sebastian P{\"u}tz and Benjamin Sch{\"a}fer and Tobias K{\"a}fer and Michael F{\"a}rber},
title = {Real-E: A Foundation Benchmark for Advancing Robust and Generalizable Electricity Forecasting},
booktitle = {CIKM},
year = {2025},
location = {Seoul, South Korea},
url = {https://doi.org/10.48550/arXiv.2509.05768},
abstract = {Energy forecasting is vital for grid reliability and operational efficiency. Although recent advances in time series forecasting have led to progress, existing benchmarks remain limited in spatial and temporal scope and lack multi-energy features. This raises concerns about their reliability and applicability in real-world deployment. To address this, we present the Real-E dataset, covering over 74 power stations across 30+ European countries over a 10-year span with rich metadata. Using Real-E, we conduct an extensive data analysis and benchmark over 20 baselines across various model types. We introduce a new metric to quantify shifts in correlation structures and show that existing methods struggle on our dataset, which exhibits more complex and non-stationary correlation dynamics. Our findings highlight key limitations of current methods and offer a strong empirical basis for building more robust forecasting models.},
pdf = {publications/Real-E_CIKM2025.pdf},
keywords = {energy forecasting, time series},
tldr = {We release Real-E, a large multi-country electricity-forecasting benchmark, and show current methods struggle under non-stationary correlation shifts.},
plain = {We build a “weather station” for electricity, release a large real-world testbed, and show why models stumble when the grid’s patterns change over time.}
}
@inproceedings{Yuan2025HatefulPersona,
author = {Shuzhou Yuan and Ercong Nie and Mario Tawfelis and Helmut Schmid and Hinrich Sch{\"u}tze and Michael F{\"a}rber},
title = {Hateful Person or Hateful Model? Investigating the Role of Personas in Hate Speech Detection by Large Language Models},
booktitle = {PALS@EMNLP},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2506.08593},
abstract = {Hate speech detection is a socially sensitive and inherently subjective task, with judgments often varying based on personal traits. While prior work has examined how socio-demographic factors influence annotation, the impact of personality traits on Large Language Models (LLMs) remains largely unexplored. In this paper, we present the first comprehensive study on the role of persona prompts in hate speech classification, focusing on MBTI-based traits. A human annotation survey confirms that MBTI dimensions significantly affect labeling behavior. Extending this to LLMs, we prompt four open-source models with MBTI personas and evaluate their outputs across three hate speech datasets. Our analysis uncovers substantial persona-driven variation, including inconsistencies with ground truth, inter-persona disagreement, and logit-level biases. These findings highlight the need to carefully define persona prompts in LLM-based annotation workflows, with implications for fairness and alignment with human values.},
pdf = {publications/PersonaHate_EMNLP2025.pdf},
keywords = {hate speech, personas, MBTI, bias, LLM behavior},
tldr = {We show that persona prompts can substantially change LLM hate-speech labels, raising fairness and reliability concerns for LLM-based moderation and annotation.},
plain = {We show that switching a chatbot’s “persona mask” can change its hate-speech judgments, like getting different verdicts from the same judge in different costumes.}
}
@inproceedings{gruber2024complextempqa,
title = {ComplexTempQA: A 100m Dataset for Complex Temporal Question Answering},
author = {Raphael Gruber and Abdelrahman Abdallah and Michael F{\"a}rber and Adam Jatowt},
booktitle = {EMNLP},
location = {Suzhou, China},
year = {2025},
url = {https://arxiv.org/abs/2406.04866},
abstract = {We introduce ComplexTempQA, a large-scale dataset consisting of over 100 million question-answer pairs designed to tackle the challenges in temporal question answering. ComplexTempQA significantly surpasses existing benchmarks in scale and scope. Utilizing Wikipedia and Wikidata, the dataset covers questions spanning over two decades and offers an unmatched scale. We introduce a new taxonomy that categorizes questions as attributes, comparisons, and counting questions, revolving around events, entities, and time periods, respectively. A standout feature of ComplexTempQA is the high complexity of its questions, which demand reasoning capabilities for answering such as across-time comparison, temporal aggregation, and multi-hop reasoning involving temporal event ordering and entity recognition. Additionally, each question is accompanied by detailed metadata, including specific time scopes, allowing for comprehensive evaluation of temporal reasoning abilities of large language models.},
pdf = {publications/ComplexTempQA_EMNLP2025.pdf},
keywords = {temporal QA, dataset, multi-hop reasoning, Wikidata, temporal reasoning},
tldr = {ComplexTempQA provides 100 million temporal QA pairs that require multi-hop reasoning, enabling realistic evaluation of temporal reasoning in language models.},
plain = {We create a giant time-travel quiz for AI with 100 million questions about “before”, “after”, and “during”, to see whether models truly keep track of time.}
}
@inproceedings{Popovic2025JEDI,
author = {Nicholas Popovic and Michael F{\"a}rber},
title = {Extractive Fact Decomposition for Interpretable Natural Language Inference in One Forward Pass},
booktitle = {EMNLP},
year = {2025},
location = {Suzhou, China},
url = {https://aclanthology.org/2025.emnlp-main.1615.pdf},
abstract = {Recent works in Natural Language Inference (NLI) and related tasks, such as automated fact-checking, employ atomic fact decomposition to enhance interpretability and robustness. For this, existing methods rely on resource-intensive generative large language models (LLMs) to perform decomposition. We propose JEDI, an encoder-only architecture that jointly performs extractive atomic fact decomposition and interpretable inference without requiring generative models during inference. To facilitate training, we produce a large corpus of synthetic rationales covering multiple NLI benchmarks. Experimental results demonstrate that JEDI achieves competitive accuracy in distribution and significantly improves robustness out of distribution and in adversarial settings over models based solely on extractive rationale supervision. Our findings show that interpretability and robust generalization in NLI can be realized using encoder-only architectures and synthetic rationales.},
pdf = {publications/JEDI_EMNLP2025.pdf},
keywords = {NLI, interpretability, fact decomposition, encoder-only},
tldr = {JEDI performs atomic fact decomposition and interpretable NLI in a single encoder-only forward pass, reducing reliance on expensive generative inference at test time.},
plain = {We break long statements into small fact “building blocks” and check them efficiently, like snapping sentences into LEGO pieces so the system can justify true or false.}
}
@inproceedings{Yuan2025GTRex,
author = {Shuzhou Yuan and Jingyi Sun and Ran Zhang and Michael F{\"a}rber and Steffen Eger and Pepa Atanasova and Isabelle Augenstein},
title = {Graph-Guided Textual Explanation Generation Framework},
booktitle = {EMNLP},
year = {2025},
location = {Suzhou, China},
url = {https://aclanthology.org/2025.emnlp-main.1494.pdf},
abstract = {Natural language explanations (NLEs) are commonly used to provide plausible free-text explanations of a model’s reasoning about its predictions. However, recent work has questioned their faithfulness, as they may not accurately reflect the model’s internal reasoning process regarding its predicted answer. In contrast, highlight explanations--input fragments critical for the model’s predicted answers--exhibit measurable faithfulness. Building on this foundation, we propose G-TEx, a Graph-Guided Textual Explanation Generation framework designed to enhance the faithfulness of NLEs. Specifically, highlight explanations are first extracted as faithful cues reflecting the model’s reasoning logic toward answer prediction. They are subsequently encoded through a graph neural network layer to guide the NLE generation, which aligns the generated explanations with the model’s underlying reasoning toward the predicted answer. Experiments on both encoder-decoder and decoder-only models across three reasoning datasets demonstrate that G-TEx improves NLE faithfulness by up to 12.18\% compared to baseline methods. Additionally, G-TEx generates NLEs with greater semantic and lexical similarity to human-written ones. Human evaluations show that G-TEx can decrease redundant content and enhance the overall quality of NLEs. Our work presents a novel method for explicitly guiding NLE generation to enhance faithfulness, serving as a foundation for addressing broader criteria in NLE and generated text.},
pdf = {publications/GTRex_EMNLP2025.pdf},
keywords = {explainability, NLE, graph neural networks, faithfulness, LLM},
tldr = {We generate free-text explanations guided by faithful highlight cues encoded with a graph layer, improving explanation faithfulness across reasoning datasets.},
plain = {We make explanations more like a guided tour by having the model point to evidence first and then connect it step by step, instead of telling a nice story after the fact.}
}
@inproceedings{XLLM-ACL2025,
author = {Nicholas Popovic and Ashish Kangen and Tim Schopf and Michael F{\"a}rber},
title = {In-Context Learning for Information Extraction using Fully Synthetic Demonstrations},
booktitle = {XLLM@ACL},
year = {2025},
abstract = {Large, high-quality annotated corpora remain scarce in document-level entity and relation extraction in zero-shot or few-shot settings. In this paper, we present a fully automatic, LLM-based pipeline for synthetic data generation and in-context learning for document-level entity and relation extraction. In contrast to existing approaches that rely on manually annotated demonstrations or direct zero-shot inference, our method combines synthetic data generation with retrieval-based in-context learning, using a reasoning-optimized language model. This allows us to build a high-quality demonstration database without manual annotation and to dynamically retrieve relevant examples at inference time. Based on our approach we produce a synthetic dataset of over 5k Wikipedia abstracts with approximately 59k entities and 30k relation triples. Finally, we evaluate in-context learning performance on the DocIE shared task, extracting entities and relations from long documents in a zero-shot setting. We find that in-context joint entity and relation extraction at document-level remains a challenging task, even for state-of-the-art large language models.},
keywords = {information extraction, entity extraction, relation extraction, in-context learning, synthetic data, document-level IE, LLMs},
location = {Vienna, Austria},
pdf = {publications/DocIE_XLLM-ACL2025.pdf},
tldr = {We generate synthetic demonstrations and retrieve them at inference to improve document-level entity and relation extraction in low- or zero-shot settings.},
plain = {We let the AI write its own practice questions and then pull the most useful ones at the right moment, like learning with flashcards you made yourself and picking the cards that really help.}
}
@inproceedings{SDP-ACL2025,
author = {Tim Schopf and Juraj Vladika and Michael F{\"a}rber and Florian Matthes},
title = {Natural Language Inference Fine-tuning for Scientific Hallucination Detection},
booktitle = {SDP@ACL},
year = {2025},
abstract = {Modern generative Large Language Models (LLMs) are capable of generating text that sounds coherent and convincing, but are also prone to producing hallucinations, facts that contradict the world knowledge. Even in the case of Retrieval-Augmented Generation (RAG) systems, where relevant context is first retrieved and passed in the input, the generated facts can contradict or not be verifiable by the provided references. This has motivated SciHal 2025, a shared task that focuses on the detection of hallucinations for scientific content. The two subtasks focused on: (1) predicting whether a claim from a generated LLM answer is entailed, contradicted, or unverifiable by the used references; (2) predicting a fine-grained category of erroneous claims. Our best performing approach used an ensemble of fine-tuned encoder-only ModernBERT and DeBERTa-v3 models for classification. Out of nine competing teams, our approach achieved the first place in sub-task 1 and the second place in sub-task 2.},
location = {Vienna, Austria},
pdf = {publications/NLI_Scholarly-Hallucination-Detection_SDP-ACL2025.pdf},
keywords = {hallucination detection, NLI, scientific text, LLMs},
tldr = {We fine-tune NLI models to detect scientific hallucinations by checking claims against references, achieving top performance in a shared task.},
plain = {We build a science fact-checker that compares a model’s statements with the sources it cites, like a receipt check that flags items that do not match.}
}
@inproceedings{10.1145/2983323.2983324,
author = {Zhang, Lei and F\"{a}rber, Michael and Rettinger, Achim},
title = {XKnowSearch! Exploiting Knowledge Bases for Entity-based Cross-lingual Information Retrieval},
year = {2016},
isbn = {9781450340731},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2983323.2983324},
doi = {10.1145/2983323.2983324},
abstract = {In recent years, the amount of entities in large knowledge bases available on the Web has been increasing rapidly, making it possible to propose new ways of intelligent information access. Within the context of globalization, there is a clear need for techniques and systems that can enable multilingual and cross-lingual information access. In this paper, we present XKnowSearch!, a novel entity-based system for multilingual and cross-lingual information retrieval, which supports keyword search and also allows users to influence the search process according to their search intents. By leveraging the multilingual knowledge base on the Web, keyword queries and documents can be represented in their semantic forms, which can facilitate query disambiguation and expansion, and can also overcome the language barrier between queries and documents in different languages.},
booktitle = {CIKM},
pages = {2425--2428},
keywords = {knowledge bases, information retrieval, entity-based, cross-lingual},
location = {Indianapolis, Indiana, USA},
pdf = {publications/XKnowSearch_CIKM2016.pdf},
tldr = {XKnowSearch uses knowledge-base entities to disambiguate queries and retrieve documents across languages for entity-based cross-lingual information retrieval.},
plain = {We build a cross-language search tool that uses structured background knowledge like a bilingual map, so you can find relevant texts even when the query and documents use different languages.}
}
% KDD 2025 paper. NOTE(review): author accent is raw UTF-8 ("Färber") while most
% entries use {\"{a}} — both render under Biber/Jekyll; consider unifying.
% Pages not yet listed — add once the proceedings pagination is available.
@inproceedings{Susanti2025PathToCausality,
title = {Paths to Causality: Finding Informative Subgraphs within Knowledge Graphs for Knowledge-Based Causal Discovery},
author = {Yuni Susanti and Michael Färber},
booktitle = {KDD},
year = {2025},
address = {Toronto, Canada},
doi = {10.1145/3711896.3737076},
abstract = {Inferring causal relationships between variable pairs is crucial for understanding multivariate interactions in complex systems. Knowledge-based causal discovery—which involves inferring causal relationships by reasoning over the metadata of variables (e.g., names or textual context)—offers a compelling alternative to traditional methods that rely on observational data. However, existing methods using Large Language Models (LLMs) often produce unstable and inconsistent results, compromising their reliability for causal inference. To address this, we introduce a novel approach that integrates Knowledge Graphs (KGs) with LLMs to enhance knowledge-based causal discovery. Our approach identifies informative metapath-based subgraphs within KGs and further refines their selection using Learning-to-Rank models. The top-ranked subgraphs are then incorporated into zero-shot prompts, improving the effectiveness of LLMs in inferring the causal relationship. Extensive experiments on biomedical and open-domain datasets demonstrate that our method outperforms most baselines by up to 44.8 points in F1 scores, evaluated across diverse LLMs and KGs. Our code and datasets are available on GitHub.},
pdf = {publications/Paths_to_Causality_KDD2025.pdf},
keywords = {causal discovery, knowledge graphs, LLMs, metapaths, ranking},
tldr = {We propose a neurosymbolic method for knowledge-based causal discovery that selects relevant knowledge graph subgraphs to ground LLM prompting.},
plain = {We help AI reason about cause and effect by highlighting the most informative routes through a knowledge map, like handing a detective the best trail of clues.},
selected = {true}
}
% Journal article in "Program" (Emerald). NOTE(review): DBLP key suffix is "16"
% but year is 2015 and the DOI string contains 2015 — verify whether the issue
% (vol. 50, no. 2) was formally published in 2016; key kept stable either way.
@article{DBLP:journals/program/Farber16,
title = {Using a semantic wiki for technology forecast and technology monitoring},
author = {Michael F{\"{a}}rber},
year = {2015},
journal = {Program},
volume = {50},
number = {2},
pages = {225--242},
doi = {10.1108/PROG-06-2015-0043},
url = {https://doi.org/10.1108/PROG-06-2015-0043},
biburl = {https://dblp.org/rec/journals/program/Farber16.bib},
abstract = {The purpose of this paper is to present extensions of Semantic MediaWiki that support technology forecasting and technology monitoring. Based on requirements elicited from potential end users, namely technology experts, new visualization and analysis components were designed and implemented. The usability and applicability of these components are evaluated through task-based user studies that assess their effectiveness in realistic scenarios. The results show that, although semantic wikis are well suited for knowledge management in industrial contexts due to their user-friendly mechanisms for storing and retrieving information, they have so far rarely been applied to technology forecasting and monitoring. This work demonstrates that the additional requirements of these tasks can be fulfilled and that established techniques for technology analysis can be integrated into Semantic MediaWiki. By doing so, a new application area for Semantic MediaWiki is introduced. The presented tools and techniques provide practical value for technology and innovation management and offer open-source alternatives that can be applied in real-world settings where previously expensive dedicated software solutions were required.},
keywords = {semantic wiki, technology forecasting, technology monitoring, knowledge management, innovation management},
pdf = {publications/SemWiki_PROG2015.pdf},
tldr = {We extend Semantic MediaWiki with forecasting and monitoring features so organizations can track emerging technologies using structured knowledge and practical workflows.},
plain = {We turn a semantic wiki into a radar for emerging technologies, so teams can collect and compare new signals instead of losing them in scattered notes.}
}
% Journal of Web Semantics article (2017). NOTE(review): journal given as the
% short form "Web Semantics" — full Elsevier title is "Journal of Web Semantics";
% confirm which display form the site should use. DOI case is irrelevant (DOIs
% are case-insensitive), so the uppercase DBLP export form is left as-is.
@article{DBLP:journals/ws/ZhangTRFMD17,
title = {The xLiMe system: Cross-lingual and cross-modal semantic annotation, search and recommendation over live-TV, news and social media streams},
author = {Lei Zhang and Andreas Thalhammer and Achim Rettinger and Michael F{\"{a}}rber and Aditya Mogadala and Ronald Denaux},
year = 2017,
journal = {Web Semantics},
volume = {46-47},
pages = {20--30},
doi = {10.1016/J.WEBSEM.2017.03.002},
url = {https://doi.org/10.1016/j.websem.2017.03.002},
abstract = {Modern web search engines still exhibit several limitations, including the lack of search term disambiguation, the inability to handle multilingual search terms within a single query, restrictions that require retrieved media items to be in the same language as the query, and limited integration of search results across live streams of heterogeneous media sources such as television, online news, and social media. The system presented in this paper addresses these challenges by combining a media stream processing architecture with cross-lingual and cross-modal semantic annotation, search, and recommendation capabilities. All components of the system were developed within the xLiMe project.},
keywords = {semantic annotation, cross-lingual search, cross-modal search, media stream processing, recommendation systems},
pdf = {publications/xLiMe-System_JWS2017.pdf},
tldr = {We enable cross-lingual and cross-media semantic annotation, search, and recommendation over live TV, news, and social media streams.},
plain = {We build a kind of “universal remote” for information streams, linking TV, news, and social media across languages so topics can be followed beyond language barriers.}
}
% Semantic Web Journal article (2018). Metadata complete: doi, url, volume,
% number, pages all present and consistent with the DOI string.
@article{DBLP:journals/semweb/FarberMH18,
title = {A Linked Data Wrapper for CrunchBase},
author = {Michael F{\"{a}}rber and Carsten Menne and Andreas Harth},
year = 2018,
journal = {Semantic Web},
volume = 9,
number = 4,
pages = {505--515},
doi = {10.3233/SW-170278},
url = {https://doi.org/10.3233/SW-170278},
abstract = {CrunchBase is a database of startups and technology companies that can be searched, browsed, and edited via a web interface and is also accessible through an entity-centric HTTP API providing data in JSON format. In this paper, we present a wrapper for the CrunchBase API that exposes the data as Linked Data. The wrapper provides schema-level links to established vocabularies such as schema.org, Friend-of-a-Friend, and Vocabulary-of-a-Friend, as well as entity-level links to DBpedia for organization entities. We further describe how the RDF data can be harvested to create a local copy for advanced processing and querying beyond the capabilities of the original CrunchBase API. Finally, we outline several use cases in which both the Linked Data API for CrunchBase and the crawled CrunchBase RDF dataset have been applied in subsequent research.},
pdf = {publications/CrunchBaseWrapper_SWJ2017.pdf},
keywords = {Linked Data, APIs, RDF, companies, data integration},
tldr = {We provide Crunchbase as Linked Data, enabling richer integration and querying of startup, people, and investment data with web standards.},
plain = {We publish Crunchbase as Linked Data, turning startup and investment information into a connected map that is easier to combine, query, and analyze.}
}
% Semantic Web Journal survey on KG data quality (2018). Metadata complete;
% {YAGO} correctly brace-protected in the title against style recasing.
@article{FarberBMR18,
title = {Linked Data Quality of DBpedia, Freebase, OpenCyc, Wikidata, and {YAGO}},
author = {Michael F{\"{a}}rber and Frederic Bartscherer and Carsten Menne and Achim Rettinger},
year = 2018,
journal = {Semantic Web},
volume = 9,
number = 1,
pages = {77--129},
doi = {10.3233/SW-170275},
url = {https://doi.org/10.3233/SW-170275},
abstract = {In recent years, several large, cross-domain, and openly available knowledge graphs (KGs) have been created, including DBpedia, Freebase, OpenCyc, Wikidata, and YAGO. Despite their widespread use, these knowledge graphs have not yet been subjected to a comprehensive comparative analysis. In this survey, we introduce a set of data quality criteria for systematically analyzing knowledge graphs and apply these criteria to compare the aforementioned KGs. In addition, we propose a framework to support the selection of the most suitable knowledge graph for a given application scenario.},
keywords = {knowledge graphs, linked data quality, data quality assessment, DBpedia, Wikidata},
pdf = {publications/KG-Comparison_SWJ2017.pdf},
tldr = {We compare major knowledge graphs using a systematic data-quality framework and help practitioners choose the right graph for their application needs.},
plain = {We create a consumer-style test report for major knowledge graphs, comparing data quality so developers can choose the right “map of the world” for their use case.},
selected = {true}
}
% CoRR/arXiv preprint (2018). Fix: added the arXiv DOI (deterministically
% 10.48550/arXiv.<id>) for consistency with the sibling CoRR entry
% DBLP:journals/corr/abs-1907-08671, which already carries its arXiv DOI.
@article{DBLP:journals/corr/abs-1809-11099,
title = {Which Knowledge Graph Is Best for Me?},
author = {Michael F{\"{a}}rber and Achim Rettinger},
year = 2018,
journal = {CoRR},
volume = {abs/1809.11099},
url = {http://arxiv.org/abs/1809.11099},
doi = {10.48550/arXiv.1809.11099},
eprinttype = {arXiv},
eprint = {1809.11099},
abstract = {In recent years, DBpedia, Freebase, OpenCyc, Wikidata, and YAGO have emerged as prominent large, cross-domain, and freely available knowledge graphs. Despite their widespread adoption, these knowledge graphs are difficult to compare within specific application settings, making it challenging for researchers and developers to select the most suitable one for their needs. In a prior in-depth survey, we defined and applied a set of data quality criteria to systematically analyze these knowledge graphs and proposed a decision-support framework for identifying the most appropriate KG for a given context. In this paper, we aim to lower the barrier to accessing these results by presenting simplified rules that map individual data quality requirements to specific knowledge graphs. This work is intended as a complement to, rather than a replacement for, the previously introduced decision-support framework, to which we refer readers seeking a well-informed selection process.},
keywords = {knowledge graphs, decision support, data quality, linked data, KG selection},
pdf = {publications/Which_KG_arXiv2018.pdf},
tldr = {We translate knowledge graph quality analysis into practical selection rules so users can pick a knowledge graph that matches their data quality requirements.},
plain = {We offer a practical “which map should I buy?” guide for knowledge graphs, turning quality checks into simple rules that help people choose.}
}
% CoRR/arXiv preprint (2019). Metadata complete, including arXiv DOI and eprint
% fields; abstract uses LaTeX markup (\texttt, \url, {,}) intentionally.
@article{DBLP:journals/corr/abs-1907-08671,
title = {Linked Crunchbase: {A} Linked Data {API} and {RDF} Data Set About Innovative Companies},
author = {Michael F{\"{a}}rber},
year = 2019,
journal = {CoRR},
volume = {abs/1907.08671},
url = {http://arxiv.org/abs/1907.08671},
doi = {10.48550/arXiv.1907.08671},
eprinttype = {arXiv},
eprint = {1907.08671},
biburl = {https://dblp.org/rec/journals/corr/abs-1907-08671.bib},
abstract = {Crunchbase is an online platform that aggregates information about startups and technology companies, including attributes and relationships among companies, people, and investments. A large portion of the data contained in Crunchbase is not available from other sources, making it a unique resource. In this paper, we present an approach for bringing Crunchbase data to the Web of Data, enabling its use in a machine-readable RDF format. We first describe the development and hosting of a Linked Data API for Crunchbase, including the integration of \texttt{sameAs} links to external data sources. We then outline our method for crawling RDF data via this API to construct a custom Crunchbase RDF knowledge graph. The resulting dataset comprises over 347 million triples, including 781{,}000 people, 659{,}000 organizations, and 343{,}000 investments. The Crunchbase Linked Data API is publicly available at \url{http://linked-crunchbase.org/}.},
keywords = {linked data, RDF, knowledge graphs, startups, innovation data},
pdf = {publications/Crunchbase_arXiv2019.pdf},
tldr = {We provide a Linked Data API and RDF dataset for Crunchbase, enabling machine-readable analytics of startups, organizations, people, and investments.},
plain = {We publish Crunchbase as a web-friendly data stream, like adding a spout so machines can “drink” structured startup information.}
}
% IJDL survey article (2020). Fix: journal was the DBLP abbreviation
% "Int. J. Digit. Libr." while the same venue appears in full elsewhere in this
% file (entry abs-2111-05097); normalized to the full name to avoid two
% spellings of one journal.
@article{DBLP:journals/jodl/FarberJ20,
title = {Citation Recommendation: Approaches and Datasets},
author = {Michael F{\"{a}}rber and Adam Jatowt},
year = 2020,
journal = {International Journal on Digital Libraries},
volume = 21,
number = 4,
pages = {375--405},
doi = {10.1007/S00799-020-00288-2},
url = {https://doi.org/10.1007/s00799-020-00288-2},
abstract = {Citation recommendation addresses the task of recommending relevant citations for a given text. Owing to the rapid growth of published scientific literature on the one hand and the necessity of citing the most appropriate works when authoring scientific texts on the other hand, citation recommendation has emerged as an important research area. In recent years, numerous approaches and evaluation datasets have been proposed. However, to the best of our knowledge, no comprehensive literature survey has explicitly focused on citation recommendation. In this article, we provide a thorough introduction to automatic citation recommendation research. We then present an overview of existing approaches and datasets, identifying their differences and commonalities across multiple dimensions. Finally, we discuss evaluation methods, highlight general challenges in evaluation, and outline ways to address them. While we restrict our survey to citation recommendation for scientific publications, as this document type has been studied most extensively, many of the observations and discussions are also applicable to other text types, such as news articles and encyclopedic content.},
keywords = {citation recommendation, scholarly search, literature review, datasets, information retrieval},
pdf = {publications/CiteRecSurvey_IJDL2020.pdf},
tldr = {We survey citation recommendation methods and datasets and highlight evaluation pitfalls and open challenges for assisting scientific writing.},
plain = {This survey explains citation recommendation, like a GPS for references, and summarizes which datasets and tests are needed before such tools deserve trust.},
selected = {true}
}
% Scientometrics article (2020). NOTE(review): title contains a Unicode right
% single quote (Publications’) — fine for Biber/Jekyll rendering, but would need
% an ASCII apostrophe for classic 8-bit BibTeX toolchains.
@article{DBLP:journals/scientometrics/SaierF20,
title = {unarXive: A Large Scholarly Dataset With Publications’ Full Text, Annotated In-Text Citations, and Links to Metadata},
author = {Tarek Saier and Michael F{\"{a}}rber},
year = 2020,
journal = {Scientometrics},
volume = 125,
number = 3,
pages = {3085--3108},
doi = {10.1007/S11192-020-03382-Z},
url = {https://doi.org/10.1007/s11192-020-03382-z},
biburl = {https://dblp.org/rec/journals/scientometrics/SaierF20.bib},
abstract = {In recent years, scholarly datasets have been employed for a wide range of tasks, including paper recommendation, citation recommendation, citation context analysis, and citation context-based document summarization. The evaluation of methods for these tasks, as well as their applicability in real-world settings, critically depends on the underlying datasets. However, existing scholarly datasets exhibit several limitations. In this paper, we introduce a new dataset comprising all publications from all scientific disciplines available on arXiv.org. In addition to providing the full plain text of papers, in-text citations are annotated using global identifiers. Moreover, both citing and cited publications are linked to the Microsoft Academic Graph, enabling access to rich bibliographic metadata. The resulting dataset contains over one million documents and 29.2 million citation contexts. Freely available for research purposes, this dataset supports more robust evaluation of paper-based and citation context-based approaches and also enables novel analyses of in-text citations, as demonstrated prototypically in this work.},
keywords = {scholarly datasets, in-text citations, citation analysis, scientific corpora, metadata},
pdf = {publications/unarXive_Scientometrics2020.pdf},
tldr = {We present unarXive, a large-scale arXiv full-text corpus with precisely linked in-text citations and rich metadata, empowering citation-aware, transparent, and reproducible scholarly NLP.},
plain = {We build a large “paper warehouse” where citations inside arXiv texts are tagged and linked, so citation analyses need far less manual cleanup.}
}
% Quantitative Science Studies article (2021). NOTE(review): the only journal
% entry in this section without a doi or url — add the QSS DOI when confirmed
% (do not guess it; QSS DOIs are not derivable from the citation key).
@article{FarberL21,
title = {The Data Set Knowledge Graph: Creating a Linked Open Data Source for Data Sets},
author = {Michael F{\"{a}}rber and David Lamprecht},
year = 2021,
journal = {Quant. Science Studies},
volume = 2,
number = 4,
pages = {1324--1355},
abstract = {Several scholarly knowledge graphs have been proposed to model and analyze the academic landscape; however, despite the remarkable increase in available data sets in recent years, existing knowledge graphs primarily focus on associated entities such as publications rather than on data sets themselves, and publicly available data set knowledge graphs do not systematically contain links to the publications in which data sets are mentioned. In this paper, we present an approach for constructing an RDF knowledge graph that fulfills these criteria. Our data set knowledge graph, DSKG, is publicly available and contains metadata of data sets across all scientific disciplines. To ensure high data quality, we identify suitable raw data set collections, establish links between data sets and publications modeled in the Microsoft Academic Knowledge Graph that mention these data sets, and address author name ambiguity by developing and evaluating an author name disambiguation method that enriches the knowledge graph with links to ORCID. Overall, the knowledge graph contains more than 2,000 data sets with associated properties and 814,000 links to 635,000 scientific publications, enabling a variety of use cases including advanced data set search systems and new ways of measuring and rewarding data set provisioning.},
keywords = {knowledge graphs, research datasets, linked open data, metadata, scholarly infrastructure},
pdf = {publications/DSKG_QSS2021.pdf},
tldr = {We introduce the Data Set Knowledge Graph (DSKG), the first large-scale linked open dataset that connects datasets to the scholarly papers that mention them, enabling better dataset discovery and making data contributions transparent and measurable.},
plain = {We link datasets to the papers that mention them, creating a map that helps people find data faster and gives data creators clearer credit.}
}
% NOTE(review): hybrid entry — journal names IJDL but volume/number/pages/doi
% are absent and the url points to the arXiv preprint. Confirm whether the
% published version's metadata is available and complete the entry accordingly.
@article{abs-2111-05097,
title = {Cross-Lingual Citations in English Papers: {A} Large-Scale Analysis of Prevalence, Usage, and Impact},
author = {Tarek Saier and Michael F{\"{a}}rber and Tornike Tsereteli},
year = 2021,
journal = {International Journal on Digital Libraries},
url = {https://arxiv.org/abs/2111.05097},
abstract = {Citation information in scholarly data is an important source of insight into the reception of publications and scholarly discourse, yet the outcomes of citation analyses and the applicability of citation-based machine learning approaches heavily depend on data completeness. A key shortcoming of current scholarly data is the frequent exclusion of non-English publications or the absence of language metadata, which has limited the study of cross-lingual citations. In this paper, we present an analysis of cross-lingual citations based on over one million English-language papers spanning three scientific disciplines and three decades, examining differences across cited languages and disciplines, temporal trends, and the usage characteristics and impact of cross-lingual citations. Our findings include an increasing rate of citations to Chinese-language publications, a predominance of citations to local non-English languages, and consistency in citation intent between cross-lingual and monolingual citations. To facilitate further research, we make the collected data and source code publicly available.},
keywords = {cross-lingual citations, citation analysis, multilingual scholarly communication, bibliometrics},
pdf = {publications/Cross-lingual_Citations_IJDL2021.pdf},
tldr = {We analyze cross-lingual citations at scale to quantify prevalence and impact, showing how language affects scholarly visibility and citation behavior.},
plain = {We trace citations across languages like trade routes on a map, showing which languages get noticed and cited in English scientific writing.}
}
% Information Retrieval Journal article (2021). Metadata complete (volume,
% number, pages, url); no separate doi field, but the url is the DOI resolver link.
@article{WangJFY21,
title = {Improving Question Answering for Event-focused Questions in Temporal Collections of News Articles},
author = {Jiexin Wang and Adam Jatowt and Michael F{\"{a}}rber and Masatoshi Yoshikawa},
year = 2021,
journal = {Information Retrieval Journal},
volume = 24,
number = 1,
pages = {29--54},
url = {https://doi.org/10.1007/s10791-020-09387-9},
abstract = {Temporal collections of news articles contain large numbers of accurate and time-aligned documents that are valuable for understanding past events, yet access to such archives remains difficult for average users due to their size and complexity. This work addresses the task of machine reading at scale on long-term news archives, motivated by the observation that questions over news collections are typically event-centric and exhibit strong temporal characteristics. We propose a large-scale question answering model specifically designed for long-term news article collections, incorporating a dedicated module for re-ranking articles using temporal information from multiple perspectives. Experimental results demonstrate that the proposed model outperforms existing question answering systems, primarily due to the temporal re-ranking module’s ability to identify more relevant documents.},
keywords = {question answering, temporal information retrieval, news archives, event-centric QA},
pdf = {publications/ImprovingQA_IRJ2021.pdf},
tldr = {We improve question answering over long-term news archives with temporal reranking, helping users retrieve relevant evidence for event-centric questions.},
plain = {We make question answering over news archives time-aware, like a historian who knows which year to search when the question forgets to say it.}
}
% CoRR/arXiv preprint (2021) with eprint fields. Abstract contains en dashes
% (argument–query) as content — intentional, do not "fix" to hyphens.
@article{abs-2112-00160,
title = {Towards Full-Fledged Argument Search: {A} Framework for Extracting and Clustering Arguments from Unstructured Text},
author = {Michael F{\"{a}}rber and Anna Steyer},
year = 2021,
journal = {CoRR},
volume = {abs/2112.00160},
url = {https://arxiv.org/abs/2112.00160},
eprinttype = {arXiv},
eprint = {2112.00160},
abstract = {Argument search aims at identifying arguments in natural language texts, a task that has traditionally been addressed through a combination of keyword search and sentence- or document-level argument identification. However, existing frameworks often cover only specific components of argument search and fail to address key challenges, including argument–query matching for differently framed topics, identification of multi-sentence arguments, and topical clustering of retrieved arguments. In this paper, we propose a unified framework to overcome these limitations by combining keyword search with precomputed topic clusters for argument–query matching, introducing a novel sentence-level sequence-labeling approach for multi-sentence argument identification, and presenting aggregated arguments through topic-aware argument clustering. Experiments on several real-world debate data sets show that density-based clustering algorithms such as HDBSCAN are well suited for argument–query matching, that our BiLSTM-based sequence-labeling approach achieves a macro F1 score of 0.71, and that while fine-grained subtopic clustering of arguments remains challenging, it is a promising direction for further exploration.},
keywords = {argument mining, argument search, text clustering, information retrieval},
pdf = {publications/Towards_Full-Fledged_Argument_Search_arXiv2021.pdf},
tldr = {We propose a unified argument-search framework that extracts and clusters arguments from text for more comprehensive debate analysis.},
plain = {We build a machine that mines arguments from messy text and sorts them into clusters, like turning a chaotic debate into neat stacks of note cards.}
}
% CoRR/arXiv preprint (2021) with eprint fields; structure matches the other
% CoRR entries in this file (journal = CoRR, volume = abs/<id>).
@article{abs-2112-00859,
title = {Are Investors Biased Against Women? Analyzing How Gender Affects Startup Funding in Europe},
author = {Michael F{\"{a}}rber and Alexander Klein},
year = 2021,
journal = {CoRR},
volume = {abs/2112.00859},
url = {https://arxiv.org/abs/2112.00859},
eprinttype = {arXiv},
eprint = {2112.00859},
abstract = {Raising capital from investors is a central challenge for startups, making it crucial for founders to understand whether and how gender bias affects early-stage funding decisions. While prior gender studies have predominantly focused on the US market, this paper provides a more comprehensive analysis of gender bias in European early-stage startup funding. We examine European startups listed on Crunchbase using Semantic Web technologies and analyze the relationship between the share of female founders in a founding team and the amount of funding raised. Our results show that a higher relative share of female founders has a negative impact on funding amounts and that founder characteristics influence funding outcomes differently based on gender. Furthermore, we find that gender bias is less pronounced for serial founders with prior entrepreneurial experience, as female founders benefit three times more than male founders from having previously founded a startup. Overall, our findings indicate that gender bias exists and should be considered in the context of startup funding.},
keywords = {gender bias, startup funding, innovation studies, linked data, empirical analysis},
pdf = {publications/CrunchBias_2021.pdf},
tldr = {We analyze European startup funding data and quantify how team gender composition relates to funding outcomes, providing evidence of structural bias risks.},
plain = {We analyze European startup funding like checking a pipeline, asking whether team gender composition relates to how money flows and where bias may appear.}
}
% QSS article (2022). Fix: doi and url contained LaTeX-escaped underscores
% (QSS\_A\_00183) — identifiers should be stored bare; every other DOI in this
% file is unescaped, and in the Jekyll/al-folio pipeline the backslashes render
% literally and break the DOI link.
@article{DBLP:journals/qss/FarberA22,
title = {The Microsoft Academic Knowledge Graph enhanced: Author name disambiguation, publication classification, and embeddings},
author = {Michael F{\"{a}}rber and Lin Ao},
year = 2022,
journal = {Quant. Science Studies},
volume = 3,
number = 1,
pages = {51--98},
doi = {10.1162/qss_a_00183},
url = {https://doi.org/10.1162/qss_a_00183},
abstract = {Although several large knowledge graphs have been proposed in the scholarly field, such graphs are limited with respect to several data quality dimensions such as accuracy and coverage. In this article, we present methods for enhancing the Microsoft Academic Knowledge Graph (MAKG), a recently published large-scale knowledge graph containing metadata about scientific publications and associated authors, venues, and affiliations. Based on a qualitative analysis of the MAKG, we address three aspects. First, we adopt and evaluate unsupervised approaches for large-scale author name disambiguation. Second, we develop and evaluate methods for tagging publications by their discipline and by keywords, facilitating enhanced search and recommendation of publications and associated entities. Third, we compute and evaluate embeddings for all 239 million publications, 243 million authors, 49,000 journals, and 16,000 conference entities in the MAKG based on several state-of-the-art embedding techniques. Finally, we provide statistics for the updated MAKG. Our final MAKG is publicly available at https://makg.org and can be used for the search or recommendation of scholarly entities, as well as enhanced scientific impact quantification.},
pdf = {publications/MAKG+_QSS2022.pdf},
keywords = {MAKG, author name disambiguation},
tldr = {We enhance a massive scholarly knowledge graph with improved author disambiguation, publication classification, and embeddings to improve search and analytics at scale.},
plain = {We clean and enrich a huge catalog of scientific papers, like fixing name tags and adding smart labels so large-scale search works more reliably.}
}
% ISWC 2023 paper. Fixes: missing space after comma in keywords
% ("Embeddings,Papers With Code"), and removed the copy-paste artifact
% "at this URL" from the abstract (a dead reference to a hyperlink on the
% arXiv landing page). NOTE(review): this entry uses `location` while the
% sibling ISWC entry uses `address` — consider unifying.
@inproceedings{faerber2023linked,
title = {Linked Papers With Code: The Latest in Machine Learning as an RDF Knowledge Graph},
author = {Michael Färber and David Lamprecht},
year = 2023,
booktitle = {ISWC},
location = {Athens, Greece},
url = {https://arxiv.org/abs/2310.20475},
abstract = {In this paper, we introduce Linked Papers With Code (LPWC), an RDF knowledge graph that provides comprehensive, current information about almost 400,000 machine learning publications. This includes the tasks addressed, the datasets utilized, the methods implemented, and the evaluations conducted, along with their results. Compared to its non-RDF-based counterpart Papers With Code, LPWC not only translates the latest advancements in machine learning into RDF format, but also enables novel ways for scientific impact quantification and scholarly key content recommendation. LPWC is openly accessible online and is licensed under CC-BY-SA 4.0. As a knowledge graph in the Linked Open Data cloud, we offer LPWC in multiple formats, from RDF dump files to a SPARQL endpoint for direct web queries, as well as a data source with resolvable URIs and links to the data sources SemOpenAlex, Wikidata, and DBLP. Additionally, we supply knowledge graph embeddings, enabling LPWC to be readily applied in machine learning applications.},
pdf = {publications/LPWC_ISWC2023.pdf},
keywords = {Knowledge Graphs, RDF, Machine Learning, Scholarly Data, Linked Open Data, Knowledge Graph Embeddings, Papers With Code, LPWC},
tldr = {LPWC turns the ML landscape into a machine-queryable knowledge graph linking papers to tasks, datasets, methods, and results for semantic search and analysis.},
plain = {LPWC turns machine-learning research into a structured “periodic table” of papers, tasks, datasets, and results that can be searched and connected.}
}
% ISWC 2023 resource paper. NOTE(review): `address` here holds the conference
% venue (Athens) — classic BibTeX reserves `address` for the publisher city;
% the sibling LPWC entry uses `location` for the same purpose. Consider unifying.
@inproceedings{faerber2023semopenalex,
title = {SemOpenAlex: The Scientific Landscape in 26 Billion RDF Triples},
author = {Michael Färber and David Lamprecht and Johan Krause and Linn Aung and Peter Haase},
year = 2023,
booktitle = {ISWC},
address = {Athens, Greece},
url = {https://arxiv.org/abs/2308.03671},
abstract = {We present SemOpenAlex, an extensive RDF knowledge graph that contains over 26 billion triples about scientific publications and their associated entities, such as authors, institutions, journals, and concepts. SemOpenAlex is licensed under CC0, providing free and open access to the data. We offer the data through multiple channels, including RDF dump files, a SPARQL endpoint, and as a data source in the Linked Open Data cloud, complete with resolvable URIs and links to other data sources. Moreover, we provide embeddings for knowledge graph entities using high-performance computing. SemOpenAlex enables a broad range of use-case scenarios, such as exploratory semantic search via our website, large-scale scientific impact quantification, and other forms of scholarly big data analytics within and across scientific disciplines. Additionally, it enables academic recommender systems, such as recommending collaborators, publications, and venues, including explainability capabilities. Finally, SemOpenAlex can serve for RDF query optimization benchmarks, creating scholarly knowledge-guided language models, and as a hub for semantic scientific publishing.},
pdf = {publications/SemOpenAlex_ISWC2023.pdf},
keywords = {Scholarly Data, Linked Open Data, Knowledge Graph},
tldr = {We release SemOpenAlex, a scholarly knowledge graph with 26 billion triples, dumps, SPARQL access, and embeddings, enabling large-scale semantic science analytics and search.},
plain = {SemOpenAlex is an open “Google Maps for science”, built as a connected map of papers and authors so others can navigate research at web scale.},
selected = {true}
}
@article{DBLP:journals/scientometrics/FarberCY23,
title = {Biases in Scholarly Recommender Systems: Impact, Prevalence, and Mitigation},
author = {Michael F{\"{a}}rber and Melissa Coutinho and Shuzhou Yuan},
year = 2023,
journal = {Scientometrics},
volume = 128,
number = 5,
pages = {2703--2736},
doi = {10.1007/S11192-023-04636-2},
url = {https://doi.org/10.1007/s11192-023-04636-2},
abstract = {With the remarkable increase in the number of scientific entities such as publications, researchers, and scientific topics, and the associated information overload in science, academic recommender systems have become increasingly important for millions of researchers and science enthusiasts. However, it is often overlooked that these systems are subject to various biases. In this article, we first break down the biases of academic recommender systems and characterize them according to their impact and prevalence. In doing so, we distinguish between biases originally caused by humans and biases induced by the recommender system. Second, we provide an overview of methods that have been used to mitigate these biases in the scholarly domain. Based on this, third, we present a framework that can be used by researchers and developers to mitigate biases in scholarly recommender systems and to evaluate recommender systems fairly. Finally, we discuss open challenges and possible research directions related to scholarly biases.},
pdf = {publications/ScholarBias_Scientometrics2023.pdf},
keywords = {Scholarly Data, Bias, Recommender Systems, Mitigation, Survey},
tldr = {We measure biases in scholarly recommender systems and discuss mitigation strategies to support fairer literature discovery under severe paper overload.},
plain = {We show that literature recommenders can have blind spots, like a librarian who keeps pointing to the same shelves, and we outline how to measure and reduce this effect.}
}
@inproceedings{Straub2023OTDRDetection,
title = {AI-Based OTDR Event Detection, Classification and Assignment to ODN Branches in Passive Optical Networks},
author = {Straub, Michael and Saier, T. and Reber, J. and Borkowski, R. and Li, S. and Richter, A. and F{\"a}rber, Michael and K{\"a}fer, T. and Bonk, R.},
booktitle = {ECOC},
year = {2023},
doi = {10.1049/icp.2023.2469},
abstract = {An AI-supported monitoring concept is demonstrated allowing detection and classification of events on OTDR traces with high precision and recall for application on a PON optical distribution network. We can also associate events with ODN branches by using deployment data of the PON topology.},
keywords = {OTDR, Passive Optical Networks, Optical Distribution Networks, AI-based Monitoring, Event Detection, Fiber Optic Sensing},
pdf = {publications/OTDR_ECOC2023.pdf},
tldr = {We detect and classify events in optical networks from OTDR traces and assign faults to network branches, improving reliability of passive optical networks.},
plain = {We listen to echoes in fiber cables like sonar and use machine learning to spot and locate faults, improving how networks can be monitored.}
}
@inproceedings{aydin2024assessing,
title = {Assessing Privacy Policies with AI: Ethical, Legal, and Technical Challenges},
author = {Irem Aydin and Hermann Diebel-Fischer and Vincent Freiberger and Julia Möller-Klapperich and Erik Buchmann and Michael Färber and Anne Lauber-Rönsberg and Birte Platow},
year = 2024,
booktitle = {AISyS},
url = {https://arxiv.org/abs/2410.08381},
abstract = {The growing use of Machine Learning and Artificial Intelligence (AI), particularly Large Language Models (LLMs) like OpenAI’s GPT series, leads to disruptive changes across organizations. At the same time, there is a growing concern about how organizations handle personal data. Thus, privacy policies are essential for transparency in data processing practices, enabling users to assess privacy risks. However, these policies are often long and complex. This might lead to user confusion and consent fatigue, where users accept data practices against their interests, and abusive or unfair practices might go unnoticed. LLMs can be used to assess privacy policies for users automatically. In this interdisciplinary work, we explore the challenges of this approach in three pillars, namely technical feasibility, ethical implications, and legal compatibility of using LLMs to assess privacy policies. Our findings aim to identify potential for future research, and to foster a discussion on the use of LLM technologies for enabling users to fulfil their important role as decision-makers in a constantly developing AI-driven digital economy.},
pdf = {publications/Privacy_Policies_AI_AISyS2024.pdf},
keywords = {Ethics, Legal, Privacy, Philosophy},
tldr = {We analyze technical feasibility plus ethical and legal risks of using LLMs to assess privacy policies.},
plain = {We ask whether AI can read privacy policies like a magnifying glass for fine print and we lay out the technical, ethical, and legal pitfalls.}
}
@article{OTDRJOCN2024,
title = {{ML} approaches for {OTDR} diagnoses in passive optical networks - event detection and classification: ways for {ODN} branch assignment},
author = {Michael Straub and Johannes Reber and Tarek Saier and Robert Borkowski and Shi Li and Dmitry Khomchenko and Andr{\'{e}} Richter and Michael F{\"{a}}rber and Tobias K{\"{a}}fer and Ren{\'{e}} Bonk},
year = 2024,
journal = {J. Opt. Commun. Netw.},
volume = 16,
number = 7,
pages = 43,
doi = {10.1364/JOCN.516659},
url = {https://doi.org/10.1364/jocn.516659},
biburl = {https://dblp.org/rec/journals/jocnet/StraubRSBLKRFKB24.bib},
abstract = {We introduce and demonstrate an ML-supported diagnostic concept for detecting and classifying events on OTDR traces in a PON optical distribution network. By incorporating PON deployment data, events can also be associated with specific ODN branches. We analyze an ensemble classifier and neural networks, and evaluate the use of synthetic OTDR-like traces alongside measured data for training. In our proof-of-concept, an ensemble classifier achieves 98\% precision and 95\% recall on measured OTDR traces, including successful mapping to ODN branches or groups of branches. On emulated data, we obtain an average precision of 70\% and an average recall of 91\%.},
pdf = {publications/OTDR_JOCN2024.pdf},
keywords = {OTDR, PON, Machine Learning, Event Detection, Fiber Monitoring},
tldr = {We improve ML-based OTDR diagnostics using measured and synthetic traces and map events to network branches for practical fiber monitoring.},
plain = {We improve fiber fault diagnosis by training on both real and simulated signals, like training a mechanic on real cars and realistic simulators.}
}
@article{CollabScientometrics2024,
title = {Benefits of international collaboration in computer science: a case study of China, the European Union, and the United States},
author = {Alberto G{\'{o}}mez{-}Esp{\'{e}}s and Michael F{\"{a}}rber and Adam Jatowt},
year = 2024,
journal = {Scientometrics},
volume = 129,
number = 2,
pages = {1155--1171},
doi = {10.1007/S11192-023-04902-3},
url = {https://doi.org/10.1007/s11192-023-04902-3},
abstract = {Co-authored publications can yield benefits such as additional expertise, increased funding opportunities, and enhanced research impact. China, the European Union, and the United States have collaborated extensively in Computer Science over the past decades, though the scale and effects of these collaborations have changed over time. In this paper, we analyze 31 years of Computer Science publications from these three regions, examining co-authorship patterns, citation impact, and research topics. Our results show that China's growing emphasis on Computer Science has made it the most productive region in recent years, and collaborations with the EU and the US have increased its citation rates. Conversely, the EU and the US have benefited from China's expanding research output, resulting in a higher volume of joint publications.},
pdf = {publications/Collaboration_Scientometrics2024.pdf},
keywords = {International Collaboration, Co-authorship, Citation Impact, China, EU, US},
tldr = {We quantify how international collaboration in computer science relates to productivity and citation impact across China, the EU, and the United States.},
plain = {We map scientific collaboration like flight routes and study how cross-border co-authoring relates to productivity and impact over decades.}
}
@article{DBLP:journals/scientometrics/FarberT24,
title = {Analyzing the Impact of Companies on {AI} Research Based on Publications},
author = {Michael F{\"{a}}rber and Lazaros Tampakis},
year = 2024,
journal = {Scientometrics},
volume = 129,
number = 1,
pages = {31--63},
doi = {10.1007/s11192-023-04867-3},
url = {https://doi.org/10.1007/s11192-023-04867-3},
abstract = {Artificial Intelligence (AI) is one of the most momentous technologies of our time. Thus, it is of major importance to know which stakeholders influence AI research. Besides researchers at universities and colleges, researchers in companies have hardly been considered in this context. In this article, we consider how the influence of companies on AI research can be made measurable on the basis of scientific publishing activities. We compare academic- and company-authored AI publications published in the last decade and use scientometric data from multiple scholarly databases to look for differences across these groups and to disclose the top contributing organizations. While the vast majority of publications is still produced by academia, we find that the citation count an individual publication receives is significantly higher when it is (co-)authored by a company. Furthermore, using a variety of altmetric indicators, we notice that publications with company participation receive considerably more attention online. Finally, we place our analysis results in a broader context and present targeted recommendations to safeguard a harmonious balance between academia and industry in the realm of AI research.},
keywords = {scientometrics, industry influence, AI research, bibliometrics, altmetrics},
pdf = {publications/AI_Impact_Scientometrics2023.pdf},
tldr = {We measure how companies influence AI research via publishing and show differences in citation impact and online attention between industry and academia.},
plain = {We measure how strongly companies shape AI research through publishing, like tracking who is rowing and who is steering in a shared boat.}
}
@inproceedings{politov2025revisiting,
title = {Revisiting Projection-based Data Transfer for Cross-Lingual Named Entity Recognition in Low-Resource Languages},
author = {Andrei Politov and Oleh Shkalikov and Ren{\'{e}} J{\"{a}}kel and Michael F{\"{a}}rber},
year = 2025,
booktitle = {NoDaLiDa/Baltic-HLT},
url = {https://arxiv.org/abs/2501.18750},
abstract = {Cross-lingual Named Entity Recognition (NER) leverages knowledge transfer between languages to identify and classify named entities, making it particularly useful for low-resource languages. We show that the data-based cross-lingual transfer method is an effective technique for cross-lingual NER and can outperform multilingual language models for low-resource languages. This paper introduces two key enhancements to the annotation projection step in cross-lingual NER for low-resource languages. First, we explore refining word alignments using back-translation to improve accuracy. Second, we present a novel formalized projection approach of matching source entities with extracted target candidates. Through extensive experiments on two datasets spanning 57 languages, we demonstrated that our approach surpasses existing projection-based methods in low-resource settings. These findings highlight the robustness of projection-based data transfer as an alternative to model-based methods for cross-lingual named entity recognition in low-resource languages.},
keywords = {cross-lingual NER, annotation projection, low-resource languages, word alignment, data transfer},
pdf = {publications/XNER_NoDaLiDA2025.pdf},
location = {Tallinn, Estonia},
tldr = {We refine projection-based cross-lingual NER transfer and show data-based transfer can outperform multilingual-model baselines in low-resource settings.},
plain = {We revisit cross-language label transfer for name detection, like tracing a drawing onto another sheet, and show how to make the tracing far more accurate.}
}
@inproceedings{DBLP:conf/clef/ZhangRFT13,
title = {A Comparative Evaluation of Cross-Lingual Text Annotation Techniques},
author = {Lei Zhang and Achim Rettinger and Michael F{\"{a}}rber and Marko Tadic},
year = 2013,
booktitle = {CLEF},
publisher = {Springer},
volume = 8138,
pages = {124--135},
doi = {10.1007/978-3-642-40802-1\_16},
url = {https://doi.org/10.1007/978-3-642-40802-1\_16},
biburl = {https://dblp.org/rec/conf/clef/ZhangRFT13.bib},
abstract = {In this paper, we address the problem of extracting knowledge from multilingual textual documents by annotating text using a cross-lingual knowledge base, namely Wikipedia. Our contribution is twofold. First, we introduce a novel evaluation framework for cross-lingual text annotation techniques that is based on annotating a parallel corpus with respect to a hub language in a cross-lingual knowledge base. Second, we analyze the performance of different cross-lingual text annotation methods within this framework. We conduct an empirical comparison of three approaches: Cross-lingual Named Entity Annotation (CL-NEA), Cross-lingual Wikifier Annotation (CL-WIFI), and Cross-lingual Explicit Semantic Analysis (CL-ESA). The results highlight the strengths and limitations of the investigated approaches and demonstrate the usefulness of the proposed evaluation framework for systematic comparison.},
keywords = {cross-lingual annotation, entity linking, Wikipedia, evaluation, multilingual NLP},
pdf = {publications/TextAnnotation_CLEF2013.pdf},
tldr = {We compare cross-lingual text annotation techniques and introduce an evaluation framework for systematic multilingual knowledge extraction.},
plain = {We compare ways to connect texts with background knowledge across languages, like labeling the same story in different tongues, and we set up a fair “test track” to compare systems.}
}
@inproceedings{DBLP:conf/esws/Farber13,
title = {Ontology-Supported Document Ranking for Novelty Search},
author = {Michael F{\"{a}}rber},
year = 2013,
booktitle = {ESWC},
location = {Montpellier, France},
publisher = {Springer},
volume = 7882,
pages = {639--644},
doi = {10.1007/978-3-642-38288-8\_43},
url = {https://doi.org/10.1007/978-3-642-38288-8\_43},
biburl = {https://dblp.org/rec/conf/esws/Farber13.bib},
abstract = {Within specific domains, users often face the challenge of populating ontologies according to their particular needs. This challenge is especially pronounced in scenarios involving novelty detection and forecasting, where users aim to integrate new information from unstructured text documents into their ontologies for subsequent knowledge-based analysis. In this paper, we propose a semantic document ranking approach that serves as a prerequisite for ontology population. By leveraging the underlying ontology for both query generation and document ranking, the approach introduces semantic structure into the retrieval process and promises improved ranking performance with respect to relevance and novelty compared to non-semantic methods.},
keywords = {novelty detection, semantic search, ontology population, document ranking, information retrieval},
pdf = {publications/NoveltySearch_ESWC2013.pdf},
tldr = {We rank documents for ontology-supported novelty search to surface novel, relevant information for knowledge-base population and monitoring.},
plain = {We use a knowledge-based compass to rank documents by what is genuinely new, like a treasure hunter looking for fresh finds rather than repeated facts.}
}
@inproceedings{DBLP:conf/dir/FarberR13,
title = {A Semantic Wiki for Novelty Search on Documents},
author = {Michael F{\"{a}}rber and Achim Rettinger},
year = 2013,
booktitle = {DIR},
location = {Delft, The Netherlands},
publisher = {CEUR-WS.org},
pages = {60--61},
url = {https://ceur-ws.org/Vol-986/paper\_6.pdf},
biburl = {https://dblp.org/rec/conf/dir/FarberR13.bib},
abstract = {Technology-oriented companies are typically interested in continuously monitoring developments related to their core technologies. However, many organizations, particularly small and medium-sized enterprises, lack efficient and systematic processes for this purpose. Existing efforts are often limited to uncoordinated keyword-based searches over web resources. In this paper, we present a semi-automatic approach that enables the structured and continuous detection of relevant, novel, and domain-specific documents on the Web. Our system is built on a semantic wiki that allows domain experts to (i) store relevant information in a structured knowledge base supporting monitoring and trend mining, and (ii) continuously import newly detected items, such as emerging technologies and their properties, into the knowledge base. Novel items are identified by generating structured queries based on the user’s contextual knowledge and by representing retrieved documents as semantic graphs. This approach facilitates the discovery of novel information in a more effective and semi-automatic manner.},
keywords = {semantic wiki, novelty search, technology monitoring, ontology population, semantic retrieval},
pdf = {publications/SemanticWiki_BIR2013.pdf},
tldr = {We present a semantic-wiki workflow for novelty search that continuously ingests and structures new web information for ongoing monitoring tasks.},
plain = {We build a semantic wiki as a watchtower that keeps pulling fresh web information into a structured store, so new signals do not get lost in the noise.}
}
@inproceedings{DBLP:conf/esws/Farber0R14,
title = {Kuphi - an Investigation Tool for Searching for and via Semantic Relations},
author = {Michael F{\"{a}}rber and Lei Zhang and Achim Rettinger},
year = 2014,
booktitle = {ESWC},
location = {Anissaras, Crete, Greece},
publisher = {Springer},
pages = {349--354},
doi = {10.1007/978-3-319-11955-7\_47},
url = {https://doi.org/10.1007/978-3-319-11955-7\_47},
biburl = {https://dblp.org/rec/conf/esws/Farber0R14.bib},
abstract = {In this work, we present Kuphi, a novel process-oriented approach to information retrieval designed for investigating entities and their semantic relations within text documents. Kuphi extends traditional bag-of-words–based search by enabling entity-centric exploration. Starting from a keyword search for a specific entity, users can not only retrieve occurrences of that entity in documents, but also search for related entities by specifying semantic relations of interest. This allows users to indirectly identify textual manifestations of relationships between entities. Through cross-lingual semantic annotation, the query language can differ from the language of the underlying documents. We demonstrate the applicability of Kuphi using DBpedia as the knowledge base and news articles collected from RSS feeds.},
keywords = {entity-centric search, semantic relations, information retrieval, cross-lingual annotation, DBpedia},
pdf = {publications/Kuphi_ESWC2014.pdf},
tldr = {Kuphi enables entity-centric search and relation-based exploration, letting users investigate documents by following semantic relations, not only keywords.},
plain = {Kuphi helps investigate texts by following who is connected to whom, like pulling a thread from one name to the next until a hidden story appears.}
}
@inproceedings{DBLP:conf/lrec/ZhangFR14,
title = {xLiD-Lexica: Cross-lingual Linked Data Lexica},
author = {Lei Zhang and Michael F{\"{a}}rber and Achim Rettinger},
year = 2014,
booktitle = {LREC},
location = {Reykjavik, Iceland},
publisher = {European Language Resources Association {(ELRA)}},
pages = {2101--2105},
url = {http://www.lrec-conf.org/proceedings/lrec2014/summaries/248.html},
biburl = {https://dblp.org/rec/conf/lrec/ZhangFR14.bib},
abstract = {In this paper, we introduce xLiD-Lexica, a set of cross-lingual linked data lexica constructed by exploiting the multilingual content of Wikipedia and resources from the Linked Open Data (LOD) cloud. The lexica provide cross-lingual groundings of linked data resources as RDF, enabling seamless integration into existing LOD datasets. In addition, we offer a SPARQL endpoint over xLiD-Lexica that allows users to conveniently access the lexica using the SPARQL query language. The availability of such multilingual and cross-lingual lexica facilitates cross-lingual information access, for example by enabling the mapping of natural language expressions in different languages to linked data resources. A wide range of natural language processing tasks, including natural language generation, cross-lingual entity linking, text annotation, and question answering, can benefit from xLiD-Lexica.},
keywords = {cross-lingual lexica, linked data, multilingual resources, RDF, SPARQL},
pdf = {publications/xLiD-Lexica_LREC2014.pdf},
tldr = {We release cross-lingual linked-data lexica that map surface forms to entities across languages, supporting multilingual annotation and information access.},
plain = {We build a cross-language lexicon that links words to the same real-world things, like a multilingual dictionary where “Paris” always means the same place.}
}
@inproceedings{DBLP:conf/semweb/0007FTR14,
title = {Exploiting Semantic Annotations for Entity-based Information Retrieval},
author = {Lei Zhang and Michael F{\"{a}}rber and Thanh Tran and Achim Rettinger},
year = 2014,
booktitle = {ISWC},
location = {Riva del Garda, Italy},
volume = 1272,
pages = {429--432},
url = {https://ceur-ws.org/Vol-1272/paper\_134.pdf},
biburl = {https://dblp.org/rec/conf/semweb/0007FTR14.bib},
abstract = {In this paper, we propose a novel approach to entity-based information retrieval that exploits semantic annotations of documents. With the growing availability of structured knowledge bases and semantic annotation techniques, documents and queries can be represented at a semantic level, thereby reducing term ambiguity and bridging language differences between queries and documents. Based on multiple semantic interpretations, users can iteratively refine their queries to better reflect their information needs. By leveraging the semantics of entities and their relations encoded in knowledge bases, we introduce a new ranking scheme designed to more effectively satisfy users’ information needs.},
keywords = {entity-based retrieval, semantic annotations, interactive search, knowledge bases, disambiguation},
pdf = {publications/EntityIR_ISWC2014.pdf},
tldr = {We show how semantic annotations support entity-based information retrieval and interactive query refinement to reduce ambiguity and improve relevance.},
plain = {We show that tagging text with real entities improves search by meaning, like swapping fuzzy keyword hunting for clear labels you can click and refine.}
}
@inproceedings{DBLP:conf/lrec/0001TJ18,
title = {A High-Quality Gold Standard for Citation-based Tasks},
author = {Michael F{\"{a}}rber and Alexander Thiemann and Adam Jatowt},
year = 2018,
booktitle = {LREC},
location = {Miyazaki, Japan},
publisher = {European Language Resources Association {(ELRA)}},
url = {http://www.lrec-conf.org/proceedings/lrec2018/summaries/283.html},
biburl = {https://dblp.org/rec/conf/lrec/0001TJ18.bib},
abstract = {Analyzing and recommending citations within their specific citation contexts has gained increasing attention due to the rapid growth of scientific literature. Although datasets such as CiteSeerX have been used to evaluate approaches for citation-dependent tasks, they suffer from substantial limitations, stemming from the challenges of information extraction, entity linking, and entity resolution. In this paper, we introduce a new evaluation dataset for citation-dependent tasks based on publications from arXiv.org. The dataset is distinguished by an almost noise-free extraction process and by the fact that all citations are correctly linked to their referenced publications. In addition to providing sentence-level textual content, cited publications are annotated directly within the text using global identifiers and, where possible, are further linked to the DBLP Computer Science Bibliography. The resulting dataset comprises over 15 million sentences and is freely available for research purposes. It supports the training and evaluation of a wide range of citation-based tasks, including citation recommendation, citation function and importance classification, and citation-based document summarization.},
keywords = {citation contexts, gold standard dataset, citation recommendation, scholarly NLP, arXiv},
pdf = {publications/CitationGold_LREC2018.pdf},
tldr = {We create a high-quality arXiv-based gold standard with correctly linked citations, enabling reliable evaluation of citation-centric NLP tasks.},
plain = {We build a carefully checked reference set where citations in arXiv full texts are linked correctly.}
}
@inproceedings{DBLP:conf/edbt/Cossu0L18,
title = {PRoST: Distributed Execution of {SPARQL} Queries Using Mixed Partitioning Strategies},
author = {Matteo Cossu and Michael F{\"{a}}rber and Georg Lausen},
year = 2018,
booktitle = {EDBT},
location = {Vienna, Austria},
pages = {469--472},
doi = {10.5441/002/EDBT.2018.49},
url = {https://doi.org/10.5441/002/edbt.2018.49},
biburl = {https://dblp.org/rec/conf/edbt/Cossu0L18.bib},
abstract = {The rapidly increasing size of RDF graphs in recent years has created a strong need for distributed storage and parallel query processing strategies. To enable efficient query processing on computer clusters, a wide range of approaches has been proposed, including systems built on top of Hadoop HDFS using technologies such as Apache Accumulo or Apache Spark. In this paper, we introduce a new RDF store called PRoST (Partitioned RDF on Spark Tables) that is based on Apache Spark. PRoST employs a novel storage strategy that combines vertical partitioning with property tables, two established models for RDF data storage. We show that PRoST outperforms state-of-the-art RDF stores in terms of query execution time across a broad spectrum of query types, while requiring no extensive precomputation phase.},
keywords = {SPARQL, distributed RDF store, Apache Spark, query processing, partitioning},
pdf = {publications/PRoST_EDBT2018.pdf},
tldr = {PRoST is a Spark-based RDF store that speeds up SPARQL by combining vertical partitioning and property tables for scalable graph querying.},
plain = {PRoST speeds up queries on linked data by changing the “warehouse layout”, so answers can be found faster on big computing clusters.}
}
@inproceedings{DBLP:conf/ecir/0001TJ18a,
title = {CITEWERTs: {A} System Combining Cite-Worthiness with Citation Recommendation},
author = {Michael F{\"{a}}rber and Alexander Thiemann and Adam Jatowt},
year = 2018,
booktitle = {ECIR},
location = {Grenoble, France},
publisher = {Springer},
pages = {815--819},
doi = {10.1007/978-3-319-76941-7\_82},
url = {https://doi.org/10.1007/978-3-319-76941-7\_82},
biburl = {https://dblp.org/rec/conf/ecir/0001TJ18a.bib},
abstract = {The vast and continuously growing number of scientific publications has created a need for automatically recommending citations for specific text segments in scholarly documents. Despite this demand, only a limited number of citation-based recommender system demonstrations have been proposed so far. Existing solutions often either ignore the raw textual context, recommend citations only for predefined citation contexts, or operate at the document level. In contrast, we propose a novel two-step architecture for citation recommendation. First, given an input text, the system determines for each potential citation context—typically at the sentence level—whether it is actually citation-worthy. If so, the system then recommends suitable citations for that context. In our demonstration, we illustrate how this architecture guides users directly to sentences that require citations and presents relevant recommendations for individual sentences, thereby reducing the effort needed to review large numbers of sentences and suggested references.},
keywords = {cite-worthiness, citation recommendation, scholarly writing, sentence-level recommendation, information retrieval},
pdf = {publications/CITEWERTs_ECIR2018.pdf},
tldr = {CITEWERTs combines cite-worthiness detection with citation recommendation to guide writers to sentences that need citations and suggest references.},
plain = {CITEWERTs first checks whether a sentence really needs a citation, like a spell-checker for missing references, and then suggests suitable papers.}
}
@inproceedings{DBLP:conf/ecir/0001TJ18,
title = {To Cite, or Not to Cite? Detecting Citation Contexts in Text},
author = {Michael F{\"{a}}rber and Alexander Thiemann and Adam Jatowt},
year = 2018,
booktitle = {ECIR},
location = {Grenoble, France},
publisher = {Springer},
pages = {598--603},
doi = {10.1007/978-3-319-76941-7\_50},
url = {https://doi.org/10.1007/978-3-319-76941-7\_50},
biburl = {https://dblp.org/rec/conf/ecir/0001TJ18.bib},
abstract = {Recommending citations for scientific texts and other document types, such as news articles, has recently received considerable attention. Existing citation recommendation approaches, however, typically do not explicitly address whether a given text segment, for example a sentence, actually deserves a citation. Introducing a prior step that determines the cite-worthiness of potential citation contexts is advantageous, as it both reduces the number of computationally expensive recommendation operations and better reflects human citation behavior by avoiding excessive or insufficient recommendations. In this paper, we propose a convolutional recurrent neural network–based approach for classifying potential citation contexts with respect to their cite-worthiness. Our experimental results show that the proposed method significantly outperforms a baseline approach and reduces the number of citation recommendations to approximately one tenth.},
keywords = {cite-worthiness, citation context detection, neural networks, scholarly text mining, citation recommendation},
pdf = {publications/CitationContext_ECIR2018.pdf},
tldr = {We detect which sentences truly require citations, reducing unnecessary recommendations and better matching real scientific writing behavior.},
plain = {We teach a system to spot which sentences should cite sources.}
}
@inproceedings{DBLP:conf/semweb/Farber19,
title = {The Microsoft Academic Knowledge Graph: {A} Linked Data Source with 8 Billion Triples of Scholarly Data},
author = {Michael F{\"{a}}rber},
year = 2019,
booktitle = {ISWC},
location = {Auckland, New Zealand},
publisher = {Springer},
volume = 11779,
pages = {113--129},
doi = {10.1007/978-3-030-30796-7\_8},
url = {https://doi.org/10.1007/978-3-030-30796-7\_8},
biburl = {https://dblp.org/rec/conf/semweb/Farber19.bib},
abstract = {In this paper, we present the Microsoft Academic Knowledge Graph (MAKG), a large-scale RDF dataset comprising over eight billion triples that describe scientific publications and related entities, including authors, institutions, journals, and fields of study. The dataset is released under the Open Data Commons Attribution License (ODC-By). By providing the data both as RDF dump files and as a data source within the Linked Open Data cloud—featuring resolvable URIs and links to external data sources—we make a substantial body of scholarly data available on the Web of Data. In addition, we provide entity embeddings for all 210 million publications represented in the graph. The MAKG supports a wide range of use cases, particularly in the domain of digital libraries, such as (1) entity-centric exploration of publications, researchers, and affiliations; (2) data integration tasks using RDF as a common data model and interlinking with other datasets; and (3) large-scale data analysis and knowledge discovery over scholarly data.},
keywords = {scholarly knowledge graph, linked data, RDF dataset, Microsoft Academic Graph, entity embeddings},
pdf = {publications/MAKG_ISWC2019.pdf},
tldr = {We release the Microsoft Academic Knowledge Graph in RDF with billions of triples and embeddings, enabling large-scale scholarly integration and analytics.},
plain = {We publish a huge linked dataset of scholarly metadata, like turning a library card catalog into a connected web map that machines can navigate.}
}
@inproceedings{DBLP:conf/clef/0001QA19,
title = {Identifying Twitter Bots Using a Convolutional Neural Network},
author = {Michael F{\"{a}}rber and Agon Qurdina and Lule Ahmedi},