Start implementing snowballing functionality; add logging; properly sanitize abstracts

This commit is contained in:
Andreas Tsouchlos 2025-12-30 01:50:49 +02:00
parent 14f503a554
commit 4901a2897f
6 changed files with 230 additions and 21 deletions

81
Cargo.lock generated
View File

@ -494,6 +494,29 @@ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]]
name = "env_filter"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2"
dependencies = [
"log",
"regex",
]
[[package]]
name = "env_logger"
version = "0.11.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f"
dependencies = [
"anstream",
"anstyle",
"env_filter",
"jiff",
"log",
]
[[package]] [[package]]
name = "equivalent" name = "equivalent"
version = "1.0.2" version = "1.0.2"
@ -724,6 +747,15 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "html-escape"
version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
dependencies = [
"utf8-width",
]
[[package]] [[package]]
name = "html5ever" name = "html5ever"
version = "0.35.0" version = "0.35.0"
@ -1059,6 +1091,30 @@ version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
[[package]]
name = "jiff"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a87d9b8105c23642f50cbbae03d1f75d8422c5cb98ce7ee9271f7ff7505be6b8"
dependencies = [
"jiff-static",
"log",
"portable-atomic",
"portable-atomic-util",
"serde_core",
]
[[package]]
name = "jiff-static"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b787bebb543f8969132630c51fd0afab173a86c6abae56ff3b9e5e3e3f9f6e58"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.111",
]
[[package]] [[package]]
name = "js-sys" name = "js-sys"
version = "0.3.83" version = "0.3.83"
@ -1549,6 +1605,15 @@ version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950" checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950"
[[package]]
name = "portable-atomic-util"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
dependencies = [
"portable-atomic",
]
[[package]] [[package]]
name = "potential_utf" name = "potential_utf"
version = "0.1.4" version = "0.1.4"
@ -1793,6 +1858,9 @@ dependencies = [
"ammonia", "ammonia",
"clap", "clap",
"crossterm", "crossterm",
"env_logger",
"html-escape",
"log",
"open", "open",
"ratatui", "ratatui",
"reqwest", "reqwest",
@ -1800,6 +1868,7 @@ dependencies = [
"serde_json", "serde_json",
"textwrap", "textwrap",
"tokio", "tokio",
"unicode-general-category",
] ]
[[package]] [[package]]
@ -2495,6 +2564,12 @@ version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971"
[[package]]
name = "unicode-general-category"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b993bddc193ae5bd0d623b49ec06ac3e9312875fdae725a975c51db1cc1677f"
[[package]] [[package]]
name = "unicode-ident" name = "unicode-ident"
version = "1.0.22" version = "1.0.22"
@ -2554,6 +2629,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf8-width"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1292c0d970b54115d14f2492fe0170adf21d68a1de108eebc51c1df4f346a091"
[[package]] [[package]]
name = "utf8_iter" name = "utf8_iter"
version = "1.0.4" version = "1.0.4"

View File

@ -7,6 +7,9 @@ edition = "2024"
ammonia = "4.1.2" ammonia = "4.1.2"
clap = { version = "4.5.53", features = ["derive"] } clap = { version = "4.5.53", features = ["derive"] }
crossterm = "0.29.0" crossterm = "0.29.0"
env_logger = "0.11.8"
html-escape = "0.2.13"
log = "0.4.29"
open = "5.3.3" open = "5.3.3"
ratatui = "0.30.0" ratatui = "0.30.0"
reqwest = { version = "0.12.28", features = ["json"] } reqwest = { version = "0.12.28", features = ["json"] }
@ -14,3 +17,4 @@ serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.148" serde_json = "1.0.148"
textwrap = "0.16.2" textwrap = "0.16.2"
tokio = { version = "1.48.0", features = ["full"] } tokio = { version = "1.48.0", features = ["full"] }
unicode-general-category = "1.1.0"

View File

@ -1,7 +1,11 @@
use core::panic;
use ratatui::{crossterm::event::KeyCode, widgets::ListState}; use ratatui::{crossterm::event::KeyCode, widgets::ListState};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::snowballing::Publication; use crate::snowballing::{Publication, get_publication_by_id};
use log::{debug, info, warn, error};
#[derive(Serialize, Deserialize, Default, PartialEq)] #[derive(Serialize, Deserialize, Default, PartialEq)]
pub enum ActivePane { pub enum ActivePane {
@ -20,8 +24,8 @@ pub enum ActiveTab {
#[derive(Serialize, Deserialize, Default)] #[derive(Serialize, Deserialize, Default)]
pub enum SnowballingStep { pub enum SnowballingStep {
#[default] #[default]
Forward,
Backward, Backward,
Forward,
} }
impl ToString for SnowballingStep { impl ToString for SnowballingStep {
@ -104,14 +108,29 @@ pub struct App {
pub should_quit: bool, pub should_quit: bool,
} }
// TODO: Implement exclusion and inclusion of papers (e.g., X and Y chars)
// TODO: Implement moving through steps and iterations (populating pending papers) // TODO: Implement moving through steps and iterations (populating pending papers)
// TODO: Implement input of seed papers using IDs // TODO: Implement exclusion and inclusion of papers (e.g., X and Y chars)
// TODO: Implement possibility of pushing excluded papers back into pending // TODO: Implement possibility of pushing excluded papers back into pending
// TODO: Implement export of included papers as csv for keywording with a spreadsheet // TODO: Implement export of included papers as csv for keywording with a spreadsheet
// TODO: Implement export of included papers into zotero (Use RIS format somehow) // TODO: Implement export of included papers into zotero (Use RIS format somehow)
impl App { impl App {
pub fn handle_key(&mut self, key: KeyCode) { // TODO: Show error somehow
/// Looks up the publication behind `api_link` and, on success, appends it
/// to `included_publications`.
///
/// The lookup is delegated to `get_publication_by_id`. A failed lookup is
/// not propagated: it is logged as a warning and the list is left as is.
///
/// Takes `&str` rather than `&String` so callers can pass either a
/// `&String` or a plain string slice.
pub async fn add_seed_paper(&mut self, api_link: &str) {
    match get_publication_by_id(api_link, "an.tsouchlos@gmail.com").await {
        Ok(publ) => self.included_publications.push(publ),
        Err(err) => {
            warn!(
                "Failed to get publication metadata using OpenAlex API: {}",
                err
            );
        }
    }
}
pub async fn handle_key(&mut self, key: KeyCode) {
if KeyCode::Esc == key { if KeyCode::Esc == key {
self.should_quit = true; self.should_quit = true;
return; return;
@ -123,7 +142,7 @@ impl App {
self.active_tab = ActiveTab::Snowballing; self.active_tab = ActiveTab::Snowballing;
} }
KeyCode::Enter => { KeyCode::Enter => {
// TODO: Actually add paper to included list self.add_seed_paper(&self.seeding_input.clone()).await;
self.seeding_input.clear(); self.seeding_input.clear();
} }
KeyCode::Char(to_insert) => self.seeding_input.push(to_insert), KeyCode::Char(to_insert) => self.seeding_input.push(to_insert),
@ -136,6 +155,53 @@ impl App {
_ => {} _ => {}
}, },
ActiveTab::Snowballing => match key { ActiveTab::Snowballing => match key {
KeyCode::Char(' ') => {
    // A new snowballing step may only start once every pending
    // publication has been screened.
    if !self.pending_publications.is_empty() {
        warn!(
            "The next snowballing step can only be initiated \
             after screening all pending publications"
        );
        // TODO: Show warning/error somehow
        return;
    }

    match self.snowballing_step {
        SnowballingStep::Forward => {
            // TODO: Implement
        }
        SnowballingStep::Backward => {
            // Fetch every work referenced by an included publication
            // and queue it for screening.
            for publication in &self.included_publications {
                for reference in &publication.referenced_works {
                    // References look like
                    // https://openalex.org/W2085881930; only the
                    // trailing work id is needed for the API link.
                    // `strip_prefix` avoids the panic that a fixed
                    // byte-offset slice causes on short or unexpected
                    // values; such values are passed through as-is and
                    // the failed lookup is logged below.
                    let work_id = reference
                        .strip_prefix("https://openalex.org/")
                        .unwrap_or(reference);
                    let api_link =
                        format!("https://api.openalex.org/{}", work_id);

                    let publ = get_publication_by_id(
                        &api_link,
                        "an.tsouchlos@gmail.com",
                    )
                    .await;

                    match publ {
                        Ok(publ) => self.pending_publications.push(publ),
                        // TODO: Show error somehow
                        Err(err) => {
                            // Keep the space before the line break: a
                            // trailing `\` swallows the next line's
                            // leading whitespace.
                            warn!(
                                "Failed to get publication \
                                 metadata using OpenAlex API: \
                                 {}",
                                err
                            );
                        }
                    }
                }
            }
        }
    }
}
KeyCode::Enter => match self.active_pane { KeyCode::Enter => match self.active_pane {
ActivePane::IncludedPublications => { ActivePane::IncludedPublications => {
if let Some(idx) = self.included_list_state.selected() { if let Some(idx) = self.included_list_state.selected() {

View File

@ -1,5 +1,6 @@
use std::{error::Error, io, time::Duration}; use std::{error::Error, io, time::Duration};
use log::{warn, error};
use ratatui::{ use ratatui::{
Terminal, Terminal,
backend::{Backend, CrosstermBackend}, backend::{Backend, CrosstermBackend},
@ -15,7 +16,7 @@ use ratatui::{
use crate::{app::App, ui}; use crate::{app::App, ui};
pub fn run(app: App) -> Result<App, Box<dyn Error>> { pub async fn run(app: App) -> Result<App, Box<dyn Error>> {
// setup terminal // setup terminal
enable_raw_mode()?; enable_raw_mode()?;
let mut stdout = io::stdout(); let mut stdout = io::stdout();
@ -24,7 +25,7 @@ pub fn run(app: App) -> Result<App, Box<dyn Error>> {
let mut terminal = Terminal::new(backend)?; let mut terminal = Terminal::new(backend)?;
// create app and run it // create app and run it
let app_result = run_app(&mut terminal, app); let app_result = run_app(&mut terminal, app).await;
// restore terminal // restore terminal
disable_raw_mode()?; disable_raw_mode()?;
@ -36,13 +37,14 @@ pub fn run(app: App) -> Result<App, Box<dyn Error>> {
terminal.show_cursor()?; terminal.show_cursor()?;
if let Err(err) = &app_result { if let Err(err) = &app_result {
error!("{err:?}");
println!("{err:?}"); println!("{err:?}");
} }
Ok(app_result?) Ok(app_result?)
} }
fn run_app<B: Backend>( async fn run_app<B: Backend>(
terminal: &mut Terminal<B>, terminal: &mut Terminal<B>,
mut app: App, mut app: App,
) -> io::Result<App> ) -> io::Result<App>
@ -52,11 +54,11 @@ where
loop { loop {
terminal.draw(|frame| ui::draw(frame, &mut app))?; terminal.draw(|frame| ui::draw(frame, &mut app))?;
if event::poll(Duration::from_millis(10))? { // if event::poll(Duration::from_millis(10))? {
if let Event::Key(key) = event::read()? { if let Event::Key(key) = event::read()? {
app.handle_key(key.code); app.handle_key(key.code).await;
}
} }
// }
if app.should_quit { if app.should_quit {
return Ok(app); return Ok(app);

View File

@ -54,7 +54,7 @@ fn serialize_savefile(
use clap::Parser; use clap::Parser;
mod crossterm; mod crossterm;
use std::error::Error; use std::{env, error::Error, fs::OpenOptions};
#[derive(Parser)] #[derive(Parser)]
#[command(name = "Brittling")] #[command(name = "Brittling")]
@ -62,13 +62,33 @@ use std::error::Error;
struct Args { struct Args {
#[arg(short, long)] #[arg(short, long)]
savefile: String, savefile: String,
#[arg(short, long, default_value = "/tmp/snowballing.log")]
logfile: String,
} }
fn main() -> Result<(), Box<dyn Error>> { #[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
let args = Args::parse(); let args = Args::parse();
if env::var("RUST_LOG").is_err() {
unsafe { env::set_var("RUST_LOG", "info") }
}
env_logger::Builder::from_default_env()
.format_module_path(false)
.target(env_logger::Target::Pipe(Box::new(
OpenOptions::new()
.create(true)
.append(true)
.open(args.logfile)
.unwrap(),
)))
.init();
let starting_app_state = deserialize_savefile(&args.savefile)?; let starting_app_state = deserialize_savefile(&args.savefile)?;
let final_app_state = crate::crossterm::run(starting_app_state)?; let final_app_state = crate::crossterm::run(starting_app_state).await?;
serialize_savefile(&final_app_state, &args.savefile)?; serialize_savefile(&final_app_state, &args.savefile)?;

View File

@ -1,7 +1,9 @@
use ammonia::Builder; use ammonia::Builder;
use html_escape::decode_html_entities;
use reqwest::Error; use reqwest::Error;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::{collections::HashMap}; use std::collections::HashMap;
use unicode_general_category::{GeneralCategory, get_general_category};
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
pub struct Authorship { pub struct Authorship {
@ -55,15 +57,31 @@ impl Publication {
words_with_pos.sort_by_key(|k| k.0); words_with_pos.sort_by_key(|k| k.0);
let unsanitized = words_with_pos let raw_text = words_with_pos
.into_iter() .into_iter()
.map(|(_, word)| word.as_str()) .map(|(_, word)| word.as_str())
.collect::<Vec<_>>() .collect::<Vec<_>>()
.join(" "); .join(" ");
let cleaner = Builder::empty(); let cleaner = Builder::empty().clean(&raw_text).to_string();
let sanitized = cleaner.clean(&unsanitized).to_string(); let decoded = decode_html_entities(&cleaner);
sanitized.replace("\u{a0}", " ").trim().to_string()
let cleaned: String = decoded
.chars()
.filter(|&c| {
let cat = get_general_category(c);
!matches!(
cat,
GeneralCategory::Control
| GeneralCategory::Format
| GeneralCategory::Surrogate
| GeneralCategory::PrivateUse
| GeneralCategory::Unassigned
) || c.is_whitespace()
})
.collect();
cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
}) })
} }
} }
@ -73,6 +91,24 @@ pub struct OpenAlexResponse {
pub results: Vec<Publication>, pub results: Vec<Publication>,
} }
/// Fetches a single publication from `api_link` and deserializes the JSON
/// response body into a [`Publication`].
///
/// `email` is sent along as the `mailto` query parameter. It is appended
/// with `?` when `api_link` has no query string yet and with `&` otherwise,
/// so both bare links and links that already carry parameters produce a
/// well-formed URL.
///
/// # Errors
///
/// Returns the underlying `reqwest` error if the request cannot be sent,
/// the server answers with a 4xx/5xx status, or the body cannot be
/// deserialized into a [`Publication`].
pub async fn get_publication_by_id(
    api_link: &str,
    email: &str,
) -> Result<Publication, Error> {
    let separator = if api_link.contains('?') { '&' } else { '?' };
    let url = format!("{api_link}{separator}mailto={email}");

    reqwest::Client::new()
        .get(url)
        .header("User-Agent", "Rust-OpenAlex-Client/1.0")
        .send()
        .await?
        // Surface HTTP errors as such instead of as a confusing JSON
        // decoding failure further down.
        .error_for_status()?
        .json::<Publication>()
        .await
}
// TODO: Get all papers, not just the first page // TODO: Get all papers, not just the first page
pub async fn get_citing_papers( pub async fn get_citing_papers(
target_id: &str, target_id: &str,