ma-thesis/src/thesis/clean_bibliography.sh

# Rewrite accented author names in bibliography.bib as LaTeX accent macros
# (e.g. "Héctor" -> "H{\'e}ctor").
# Quoting: inside the shell's double quotes "\\\\" collapses to "\\", and in
# sed's replacement text "\\" yields one literal backslash.
# The "g" flag escapes every occurrence on a line, not only the first one, and
# passing all expressions to a single sed call rewrites the file once.
sed -i \
  -e "s/Świerkowska/{\\\\'S}wierkowska/g" \
  -e "s/Héctor/H{\\\\'e}ctor/g" \
  -e "s/Bombín/Bomb{\\\\'i}n/g" \
  -e "s/Zémor/Z{\\\\'e}mor/g" \
  bibliography.bib

# Drop the abstract, note, urldate, url, keywords and file fields.
# -z makes sed read NUL-separated records, so a file without NUL bytes is one
# record and the pattern can consume the "\n" that ends each field; -E selects
# extended regular expressions. The value pattern accepts at most one level of
# nested braces; fields nested deeper than that do not match and are kept.
# The match also removes the single whitespace character before the field name
# and an optional trailing comma.
sed -Ezi "s/\s(abstract|note|urldate|url|keywords|file) = \{[^}]*(\{[^}]*\}[^}]*)*\},?\n//g" bibliography.bib

# Normalize arXiv-only entries to @misc with howpublished = {arXiv:<id>}.
# Detection: doi matches 10.48550/arXiv.<id>. The IEEEtranSA .bst's @article
# handler needs a journal field (which preprints lack) and ignores publisher,
# so for arXiv preprints we coerce the type to @misc and add howpublished
# (the field the .bst actually prints for @misc).
# The same pass also removes case-protecting braces around ordinary words in
# title/booktitle values (see strip_title_braces below).
python3 - <<'PY'
import re

path = "bibliography.bib"
# Read the whole bibliography into memory as one string.
with open(path) as f:
    text = f.read()

# DOI field of an arXiv preprint; group 1 captures the arXiv identifier.
# DOI names and BibTeX field names are case-insensitive, and exporters differ
# in how they capitalise the "arXiv" part of the DOI, so the match ignores case.
doi_re = re.compile(r"doi\s*=\s*\{10\.48550/arXiv\.([^}]+)\}", re.IGNORECASE)
# Start of an entry: "@type{" at the beginning of a line; group 1 is the type.
type_re = re.compile(r"^@([A-Za-z]+)\{", re.MULTILINE)
# A howpublished field that starts a line (optionally after whitespace).
howpublished_re = re.compile(r"^\s*howpublished\s*=\s*\{", re.MULTILINE)
# Opening of a title or booktitle field, up to and including its "{".
title_field_re = re.compile(r"\b(title|booktitle)\s*=\s*\{", re.IGNORECASE)
# A brace group wrapping a single run of ASCII letters/digits, e.g. "{Word}".
inner_brace_re = re.compile(r"\{([A-Za-z0-9]+)\}")

# Cut the bibliography into alternating ("text", ...) and ("entry", ...)
# pieces. An entry starts where type_re matches and ends at the "}" that
# brings the brace nesting back to zero, so braces inside field values do not
# end the entry early.
def split_entries(s):
    """Return a list of (kind, chunk) tuples whose chunks concatenate back to ``s``.

    ``kind`` is "entry" for a span starting at a ``type_re`` match and "text"
    for everything between entries. An entry whose braces never balance runs
    to the end of the input.
    """
    pieces = []
    cursor = 0
    total = len(s)
    while cursor < total:
        head = type_re.search(s, cursor)
        if head is None:
            # No further entries: the remainder is plain text.
            pieces.append(("text", s[cursor:]))
            break

        begin = head.start()
        if begin > cursor:
            pieces.append(("text", s[cursor:begin]))

        # Scan forward until the entry's opening brace is closed again.
        nesting = 0
        stop = total
        for pos in range(begin, total):
            ch = s[pos]
            if ch == "{":
                nesting += 1
            elif ch == "}":
                nesting -= 1
                if nesting == 0:
                    stop = pos + 1
                    break

        pieces.append(("entry", s[begin:stop]))
        cursor = stop
    return pieces

def normalize_arxiv(entry):
    """Turn an entry that carries an arXiv DOI into ``@misc`` with howpublished.

    Entries in which ``doi_re`` finds nothing are returned unchanged.
    Otherwise the leading ``@type{`` becomes ``@misc{`` and, unless the entry
    already has a ``howpublished`` field, ``howpublished = {arXiv:<id>}`` is
    appended as the last field, just before the entry-closing brace.
    """
    found = doi_re.search(entry)
    if found is None:
        return entry

    retyped = type_re.sub("@misc{", entry, count=1)
    if howpublished_re.search(retyped):
        return retyped

    def append_field(tail):
        # A separating comma is always emitted: either the one that was
        # already there or a new one. The captured whitespace keeps the
        # original line break in front of the new field.
        return "," + tail.group(2) + "\thowpublished = {arXiv:" + found.group(1) + "},\n}"

    return re.sub(r"(,?)(\s*)\}\s*$", append_field, retyped, count=1)

# Strip protective braces around words inside title/booktitle values.
# BibTeX uses "{Word}" inside titles to preserve case against the bibliography
# style's title-casing rules. We keep that protection when every character
# inside the braces is non-lowercase (e.g. acronyms like {NASA}); for ordinary
# words like {Quantum} we drop the braces so the style's casing applies.
# Braces that TeX itself needs are never touched: a group that directly
# follows a control sequence (\textit{toric}), "^" or "_" (d^{ab}), or a
# closing "}" (the second argument in \textcolor{red}{word}) is an argument,
# not case protection, and removing it would change what TeX typesets.
def strip_title_braces(entry):
    """Return ``entry`` with case-protecting braces removed from its titles.

    For every field opened by ``title_field_re`` the value is located by brace
    matching, and each ``{word}`` group found by ``inner_brace_re`` inside it
    is replaced by ``word`` when the word contains a lowercase character and
    the group is not a TeX argument. Everything outside those values is
    copied through unchanged.
    """
    # Text that, when it ends right before a "{", marks the group as a TeX
    # argument: a control word (optionally followed by whitespace), a control
    # symbol, a sub/superscript marker, or the end of a previous group.
    tex_argument_prefix = re.compile(r"(?:\\[A-Za-z]+\s*|\\[^A-Za-z]|[\^_}])\Z")

    def unwrap(mm):
        word = mm.group(1)
        if not any(c.islower() for c in word):
            return mm.group(0)  # acronym-like: keep the protection
        if tex_argument_prefix.search(mm.string[:mm.start()]):
            return mm.group(0)  # TeX argument: braces are required
        return word

    out, i, n = [], 0, len(entry)
    while True:
        m = title_field_re.search(entry, i)
        if not m:
            out.append(entry[i:])
            break
        # Copy everything up to and including the field's opening brace.
        out.append(entry[i:m.end()])
        # Walk to the brace that closes the field value.
        depth, j = 1, m.end()
        while j < n and depth > 0:
            c = entry[j]
            if c == "{":
                depth += 1
            elif c == "}":
                depth -= 1
                if depth == 0:
                    break
            j += 1
        out.append(inner_brace_re.sub(unwrap, entry[m.end():j]))
        if j < n:
            # Re-emit the closing brace of the field value.
            out.append(entry[j])
        i = j + 1
    return "".join(out)

def transform(entry):
    """Apply both clean-ups to one entry: arXiv normalisation, then title braces."""
    normalized = normalize_arxiv(entry)
    return strip_title_braces(normalized)

# Transform every entry, copy the text between entries through untouched,
# and write the result back over the input file.
parts = split_entries(text)
rewritten = []
for kind, chunk in parts:
    if kind == "entry":
        rewritten.append(transform(chunk))
    else:
        rewritten.append(chunk)
new_text = "".join(rewritten)

with open(path, "w") as out:
    out.write(new_text)
PY