From 4901a2897fe7ffc3826b53419fc6b510d4107534 Mon Sep 17 00:00:00 2001 From: Andreas Tsouchlos Date: Tue, 30 Dec 2025 01:50:49 +0200 Subject: [PATCH] Start implementing snowballing functionality; add logging; properly sanitize abstracts --- Cargo.lock | 81 ++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 4 +++ src/app.rs | 78 ++++++++++++++++++++++++++++++++++++++++---- src/crossterm.rs | 16 +++++---- src/main.rs | 26 +++++++++++++-- src/snowballing.rs | 46 +++++++++++++++++++++++--- 6 files changed, 230 insertions(+), 21 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c54821b..1e31044 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -494,6 +494,29 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "env_filter" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "jiff", + "log", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -724,6 +747,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "html-escape" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476" +dependencies = [ + "utf8-width", +] + [[package]] name = "html5ever" version = "0.35.0" @@ -1059,6 +1091,30 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +[[package]] +name = "jiff" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a87d9b8105c23642f50cbbae03d1f75d8422c5cb98ce7ee9271f7ff7505be6b8" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", +] + +[[package]] +name = "jiff-static" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b787bebb543f8969132630c51fd0afab173a86c6abae56ff3b9e5e3e3f9f6e58" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.111", +] + [[package]] name = "js-sys" version = "0.3.83" @@ -1549,6 +1605,15 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950" +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "potential_utf" version = "0.1.4" @@ -1793,6 +1858,9 @@ dependencies = [ "ammonia", "clap", "crossterm", + "env_logger", + "html-escape", + "log", "open", "ratatui", "reqwest", @@ -1800,6 +1868,7 @@ dependencies = [ "serde_json", "textwrap", "tokio", + "unicode-general-category", ] [[package]] @@ -2495,6 +2564,12 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" +[[package]] +name = "unicode-general-category" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b993bddc193ae5bd0d623b49ec06ac3e9312875fdae725a975c51db1cc1677f" + [[package]] name = "unicode-ident" version = "1.0.22" @@ -2554,6 +2629,12 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf8-width" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1292c0d970b54115d14f2492fe0170adf21d68a1de108eebc51c1df4f346a091" + [[package]] name = "utf8_iter" version = "1.0.4" diff --git a/Cargo.toml b/Cargo.toml index c74dec2..a768164 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,9 @@ edition = "2024" ammonia = "4.1.2" clap = { version = "4.5.53", features = ["derive"] } crossterm = "0.29.0" +env_logger = "0.11.8" +html-escape = "0.2.13" +log = "0.4.29" open = "5.3.3" ratatui = "0.30.0" reqwest = { version = "0.12.28", features = ["json"] } @@ -14,3 +17,4 @@ serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.148" textwrap = "0.16.2" tokio = { version = "1.48.0", features = ["full"] } +unicode-general-category = "1.1.0" diff --git a/src/app.rs b/src/app.rs index d67b1c9..93d90ea 100644 --- a/src/app.rs +++ b/src/app.rs @@ -1,7 +1,11 @@ +use core::panic; + use ratatui::{crossterm::event::KeyCode, widgets::ListState}; use serde::{Deserialize, Serialize}; -use crate::snowballing::Publication; +use crate::snowballing::{Publication, get_publication_by_id}; + +use log::{debug, info, warn, error}; #[derive(Serialize, Deserialize, Default, PartialEq)] pub enum ActivePane { @@ -20,8 +24,8 @@ pub enum ActiveTab { #[derive(Serialize, Deserialize, Default)] pub enum SnowballingStep { #[default] - Forward, Backward, + Forward, } impl ToString for SnowballingStep { @@ -104,14 +108,29 @@ pub struct App { pub should_quit: bool, } -// TODO: Implement exclusion and inclusion of papers (e.g., X and Y chars) // TODO: Implement moving through steps and iterations (populating pending papers) -// TODO: Implement input of seed papers using IDs +// TODO: Implement exclusion and inclusion of papers (e.g., X and Y chars) // TODO: Implement possibility of pushing excluded papers back into pending // TODO: Implement export of included papers as csv for keywording with a spreadsheet // TODO: Implement export of included papers into zotero (Use RIS format somehow) impl App { - pub fn handle_key(&mut self, key: KeyCode) { + // TODO: Show error somehow + pub async fn add_seed_paper(&mut self, api_link: &String) { + let publ = + get_publication_by_id(api_link, "an.tsouchlos@gmail.com").await; + + match publ { + Ok(publ) => self.included_publications.push(publ), + Err(err) => { + warn!( + "Failed to get publication metadata using OpenAlex API: {}", + err + ); + } + } + } + + pub async fn handle_key(&mut self, key: KeyCode) { if KeyCode::Esc == key { self.should_quit = true; return; @@ -123,7 +142,7 @@ impl App { self.active_tab = ActiveTab::Snowballing; } KeyCode::Enter => { - // TODO: Actually add paper to included list + self.add_seed_paper(&self.seeding_input.clone()).await; self.seeding_input.clear(); } KeyCode::Char(to_insert) => self.seeding_input.push(to_insert), @@ -136,6 +155,53 @@ impl App { _ => {} }, ActiveTab::Snowballing => match key { + KeyCode::Char(' ') => { + if self.pending_publications.len() > 0 { + warn!( + "The next snowballing step can only be initiated \ + after screening all pending publications" + ); + // TODO: Show warning/error somehow + return; + } + + match self.snowballing_step { + SnowballingStep::Forward => { + // TODO: Implement + } + SnowballingStep::Backward => { + for publication in &self.included_publications { + for reference in &publication.referenced_works { + let api_link = format!( + "https://api.openalex.org/{}", + &reference[21..] + ); + // https://openalex.org/W2085881930 + let publ = get_publication_by_id( + &api_link, + "an.tsouchlos@gmail.com", + ) + .await; + + match publ { + Ok(publ) => { + self.pending_publications.push(publ) + } + // TODO: Show error somehow + Err(err) => { + warn!( + "Failed to get publication\ + metadata using OpenAlex API: \ + {}", + err + ); + } + } + } + } + } + } + } KeyCode::Enter => match self.active_pane { ActivePane::IncludedPublications => { if let Some(idx) = self.included_list_state.selected() { diff --git a/src/crossterm.rs b/src/crossterm.rs index 7f2f592..1f04c61 100644 --- a/src/crossterm.rs +++ b/src/crossterm.rs @@ -1,5 +1,6 @@ use std::{error::Error, io, time::Duration}; +use log::{warn, error}; use ratatui::{ Terminal, backend::{Backend, CrosstermBackend}, @@ -15,7 +16,7 @@ use ratatui::{ use crate::{app::App, ui}; -pub fn run(app: App) -> Result> { +pub async fn run(app: App) -> Result> { // setup terminal enable_raw_mode()?; let mut stdout = io::stdout(); @@ -24,7 +25,7 @@ pub fn run(app: App) -> Result> { let mut terminal = Terminal::new(backend)?; // create app and run it - let app_result = run_app(&mut terminal, app); + let app_result = run_app(&mut terminal, app).await; // restore terminal disable_raw_mode()?; @@ -36,13 +37,14 @@ pub fn run(app: App) -> Result> { terminal.show_cursor()?; if let Err(err) = &app_result { + error!("{err:?}"); println!("{err:?}"); } Ok(app_result?) } -fn run_app( +async fn run_app( terminal: &mut Terminal, mut app: App, ) -> io::Result @@ -52,11 +54,11 @@ where loop { terminal.draw(|frame| ui::draw(frame, &mut app))?; - if event::poll(Duration::from_millis(10))? { - if let Event::Key(key) = event::read()? { - app.handle_key(key.code); - } + // if event::poll(Duration::from_millis(10))? { + if let Event::Key(key) = event::read()? { + app.handle_key(key.code).await; } + // } if app.should_quit { return Ok(app); diff --git a/src/main.rs b/src/main.rs index 4c6b87d..4254b09 100644 --- a/src/main.rs +++ b/src/main.rs @@ -54,7 +54,7 @@ fn serialize_savefile( use clap::Parser; mod crossterm; -use std::error::Error; +use std::{env, error::Error, fs::OpenOptions}; #[derive(Parser)] #[command(name = "Brittling")] @@ -62,13 +62,33 @@ use std::error::Error; struct Args { #[arg(short, long)] savefile: String, + + #[arg(short, long, default_value = "/tmp/snowballing.log")] + logfile: String, } -fn main() -> Result<(), Box> { +#[tokio::main] +async fn main() -> Result<(), Box> { let args = Args::parse(); + + if env::var("RUST_LOG").is_err() { + unsafe { env::set_var("RUST_LOG", "info") } + } + + env_logger::Builder::from_default_env() + .format_module_path(false) + .target(env_logger::Target::Pipe(Box::new( + OpenOptions::new() + .create(true) + .append(true) + .open(args.logfile) + .unwrap(), + ))) + .init(); + let starting_app_state = deserialize_savefile(&args.savefile)?; - let final_app_state = crate::crossterm::run(starting_app_state)?; + let final_app_state = crate::crossterm::run(starting_app_state).await?; serialize_savefile(&final_app_state, &args.savefile)?; diff --git a/src/snowballing.rs b/src/snowballing.rs index 185cb40..ca13f11 100644 --- a/src/snowballing.rs +++ b/src/snowballing.rs @@ -1,7 +1,9 @@ use ammonia::Builder; +use html_escape::decode_html_entities; use reqwest::Error; use serde::{Deserialize, Serialize}; -use std::{collections::HashMap}; +use std::collections::HashMap; +use unicode_general_category::{GeneralCategory, get_general_category}; #[derive(Serialize, Deserialize, Debug)] pub struct Authorship { @@ -55,15 +57,31 @@ impl Publication { words_with_pos.sort_by_key(|k| k.0); - let unsanitized = words_with_pos + let raw_text = words_with_pos .into_iter() .map(|(_, word)| word.as_str()) .collect::>() .join(" "); - let cleaner = Builder::empty(); - let sanitized = cleaner.clean(&unsanitized).to_string(); - sanitized.replace("\u{a0}", " ").trim().to_string() + let cleaner = Builder::empty().clean(&raw_text).to_string(); + let decoded = decode_html_entities(&cleaner); + + let cleaned: String = decoded + .chars() + .filter(|&c| { + let cat = get_general_category(c); + !matches!( + cat, + GeneralCategory::Control + | GeneralCategory::Format + | GeneralCategory::Surrogate + | GeneralCategory::PrivateUse + | GeneralCategory::Unassigned + ) || c.is_whitespace() + }) + .collect(); + + cleaned.split_whitespace().collect::>().join(" ") }) } } @@ -73,6 +91,24 @@ pub struct OpenAlexResponse { pub results: Vec, } +pub async fn get_publication_by_id( + api_link: &str, + email: &str, +) -> Result { + let url = format!("{}&mailto={}", api_link, email); + + let client = reqwest::Client::new(); + let response = client + .get(url) + .header("User-Agent", "Rust-OpenAlex-Client/1.0") + .send() + .await? + .json::() + .await?; + + Ok(response) +} + // TODO: Get all papers, not just the first page pub async fn get_citing_papers( target_id: &str,