Fix bibtex definition for arxiv papers

2026-05-04 01:45:46 +02:00
parent e9d996155d
commit f56cd05890
2 changed files with 108 additions and 12 deletions
--- a/src/thesis/clean_bibliography.sh
+++ b/src/thesis/clean_bibliography.sh
@@ -3,3 +3,70 @@ sed -i "s/Héctor/H{\\\\'e}ctor/" bibliography.bib
 sed -i "s/Bombín/Bomb{\\\\'i}n/" bibliography.bib
 sed -i "s/Zémor/Z{\\\\'e}mor/" bibliography.bib
 sed -Ezi "s/\s(abstract|note|urldate|url|keywords|file) = \{[^}]*(\{[^}]*\}[^}]*)*\},?\n//g" bibliography.bib
+
+# Normalize arXiv-only entries to @misc with howpublished = {arXiv:<id>}.
+# Detection: doi matches 10.48550/arXiv.<id>. The IEEEtranSA .bst's @article
+# handler needs a journal field (which preprints lack) and ignores publisher,
+# so for arXiv preprints we coerce the type to @misc and add howpublished
+# (the field the .bst actually prints for @misc).
+python3 - <<'PY'
+import re
+
+# Slurp the whole bibliography; the file is small and is rewritten in place
+# once every entry has been processed.
+path = "bibliography.bib"
+with open(path, "r") as handle:
+    text = handle.read()
+
+# arXiv DOI field; group(1) captures the arXiv identifier. DOI names and BibTeX
+# field names are case-insensitive, and some exporters write the prefix as
+# "10.48550/ARXIV.<id>" or the field as "DOI", so match without regard to case.
+doi_re = re.compile(r"doi\s*=\s*\{10\.48550/arXiv\.([^}]+)\}", re.IGNORECASE)
+# Start of an entry: "@type{" at the beginning of a line.
+type_re = re.compile(r"^@([A-Za-z]+)\{", re.MULTILINE)
+# An existing howpublished field at the start of a line, in any capitalization,
+# so that "HowPublished = {...}" is not duplicated.
+howpublished_re = re.compile(
+    r"^\s*howpublished\s*=\s*\{", re.MULTILINE | re.IGNORECASE
+)
+
+def split_entries(s):
+    """Partition *s* into a list of ("text", chunk) and ("entry", chunk) pairs.
+
+    An entry begins where type_re matches a line-initial "@type{" and ends at
+    the "}" that brings the brace depth back to zero, so braces nested inside
+    field values do not end the entry early. Everything between entries is
+    returned as "text". Concatenating the chunks in order reproduces *s*.
+    """
+    pieces, pos, total = [], 0, len(s)
+    while pos < total:
+        head = type_re.search(s, pos)
+        if head is None:
+            # No further entries: the remainder is plain text.
+            pieces.append(("text", s[pos:]))
+            break
+        begin = head.start()
+        if begin > pos:
+            pieces.append(("text", s[pos:begin]))
+        # If the braces never balance, the entry runs to the end of the input.
+        end, level = total, 0
+        for k in range(begin, total):
+            if s[k] == "{":
+                level += 1
+            elif s[k] == "}":
+                level -= 1
+                if level == 0:
+                    end = k + 1
+                    break
+        pieces.append(("entry", s[begin:end]))
+        pos = end
+    return pieces
+
+def transform(entry):
+    """Return *entry* rewritten as an arXiv @misc, or unchanged if not arXiv.
+
+    An entry is treated as an arXiv preprint when doi_re finds an arXiv DOI
+    in it. Such an entry has its type replaced by @misc and, unless a
+    howpublished field is already present, gains
+    "howpublished = {arXiv:<id>}" as its last field.
+    """
+    found = doi_re.search(entry)
+    if found is None:
+        return entry
+    retyped = type_re.sub("@misc{", entry, count=1)
+    if howpublished_re.search(retyped):
+        return retyped
+    new_field = "\thowpublished = {arXiv:" + found.group(1) + "},\n}"
+
+    def append_field(tail):
+        # group(1) is the optional comma after the last field and group(2) the
+        # whitespace before the closing brace; exactly one comma is emitted
+        # whether or not the entry already had a trailing one.
+        return "," + tail.group(2) + new_field
+
+    return re.sub(r"(,?)(\s*)\}\s*$", append_field, retyped, count=1)
+
+# Rewrite only the BibTeX entries; text between entries passes through as is.
+rewritten = []
+for kind, chunk in split_entries(text):
+    rewritten.append(transform(chunk) if kind == "entry" else chunk)
+new_text = "".join(rewritten)
+
+with open(path, "w") as out:
+    out.write(new_text)
+PY