Start implementing snowballing functionality; add logging; properly sanitize abstracts

This commit is contained in:
Andreas Tsouchlos 2025-12-30 01:50:49 +02:00
parent 14f503a554
commit 4901a2897f
6 changed files with 230 additions and 21 deletions

81
Cargo.lock generated
View File

@ -494,6 +494,29 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "env_filter"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2"
dependencies = [
"log",
"regex",
]
[[package]]
name = "env_logger"
version = "0.11.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f"
dependencies = [
"anstream",
"anstyle",
"env_filter",
"jiff",
"log",
]
[[package]]
name = "equivalent"
version = "1.0.2"
@ -724,6 +747,15 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "html-escape"
version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
dependencies = [
"utf8-width",
]
[[package]]
name = "html5ever"
version = "0.35.0"
@ -1059,6 +1091,30 @@ version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
[[package]]
name = "jiff"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a87d9b8105c23642f50cbbae03d1f75d8422c5cb98ce7ee9271f7ff7505be6b8"
dependencies = [
"jiff-static",
"log",
"portable-atomic",
"portable-atomic-util",
"serde_core",
]
[[package]]
name = "jiff-static"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b787bebb543f8969132630c51fd0afab173a86c6abae56ff3b9e5e3e3f9f6e58"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.111",
]
[[package]]
name = "js-sys"
version = "0.3.83"
@ -1549,6 +1605,15 @@ version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950"
[[package]]
name = "portable-atomic-util"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
dependencies = [
"portable-atomic",
]
[[package]]
name = "potential_utf"
version = "0.1.4"
@ -1793,6 +1858,9 @@ dependencies = [
"ammonia",
"clap",
"crossterm",
"env_logger",
"html-escape",
"log",
"open",
"ratatui",
"reqwest",
@ -1800,6 +1868,7 @@ dependencies = [
"serde_json",
"textwrap",
"tokio",
"unicode-general-category",
]
[[package]]
@ -2495,6 +2564,12 @@ version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971"
[[package]]
name = "unicode-general-category"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b993bddc193ae5bd0d623b49ec06ac3e9312875fdae725a975c51db1cc1677f"
[[package]]
name = "unicode-ident"
version = "1.0.22"
@ -2554,6 +2629,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf8-width"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1292c0d970b54115d14f2492fe0170adf21d68a1de108eebc51c1df4f346a091"
[[package]]
name = "utf8_iter"
version = "1.0.4"

View File

@ -7,6 +7,9 @@ edition = "2024"
ammonia = "4.1.2"
clap = { version = "4.5.53", features = ["derive"] }
crossterm = "0.29.0"
env_logger = "0.11.8"
html-escape = "0.2.13"
log = "0.4.29"
open = "5.3.3"
ratatui = "0.30.0"
reqwest = { version = "0.12.28", features = ["json"] }
@ -14,3 +17,4 @@ serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.148"
textwrap = "0.16.2"
tokio = { version = "1.48.0", features = ["full"] }
unicode-general-category = "1.1.0"

View File

@ -1,7 +1,11 @@
use core::panic;
use ratatui::{crossterm::event::KeyCode, widgets::ListState};
use serde::{Deserialize, Serialize};
use crate::snowballing::Publication;
use crate::snowballing::{Publication, get_publication_by_id};
use log::{debug, info, warn, error};
#[derive(Serialize, Deserialize, Default, PartialEq)]
pub enum ActivePane {
@ -20,8 +24,8 @@ pub enum ActiveTab {
#[derive(Serialize, Deserialize, Default)]
pub enum SnowballingStep {
#[default]
Forward,
Backward,
Forward,
}
impl ToString for SnowballingStep {
@ -104,14 +108,29 @@ pub struct App {
pub should_quit: bool,
}
// TODO: Implement exclusion and inclusion of papers (e.g., X and Y chars)
// TODO: Implement moving through steps and iterations (populating pending papers)
// TODO: Implement input of seed papers using IDs
// TODO: Implement exclusion and inclusion of papers (e.g., X and Y chars)
// TODO: Implement possibility of pushing excluded papers back into pending
// TODO: Implement export of included papers as csv for keywording with a spreadsheet
// TODO: Implement export of included papers into zotero (Use RIS format somehow)
impl App {
pub fn handle_key(&mut self, key: KeyCode) {
// TODO: Show error somehow
/// Looks up the publication behind `api_link` through the OpenAlex API and
/// appends it to the included publications.
///
/// Takes `&str` rather than `&String` so callers can pass any string slice;
/// existing callers that pass `&String` keep working through deref coercion.
///
/// A failed lookup is logged as a warning and otherwise ignored; the list of
/// included publications is left unchanged in that case.
pub async fn add_seed_paper(&mut self, api_link: &str) {
    match get_publication_by_id(api_link, "an.tsouchlos@gmail.com").await {
        Ok(publ) => self.included_publications.push(publ),
        Err(err) => {
            warn!(
                "Failed to get publication metadata using OpenAlex API: {}",
                err
            );
        }
    }
}
pub async fn handle_key(&mut self, key: KeyCode) {
if KeyCode::Esc == key {
self.should_quit = true;
return;
@ -123,7 +142,7 @@ impl App {
self.active_tab = ActiveTab::Snowballing;
}
KeyCode::Enter => {
// TODO: Actually add paper to included list
self.add_seed_paper(&self.seeding_input.clone()).await;
self.seeding_input.clear();
}
KeyCode::Char(to_insert) => self.seeding_input.push(to_insert),
@ -136,6 +155,53 @@ impl App {
_ => {}
},
ActiveTab::Snowballing => match key {
KeyCode::Char(' ') => {
if self.pending_publications.len() > 0 {
warn!(
"The next snowballing step can only be initiated \
after screening all pending publications"
);
// TODO: Show warning/error somehow
return;
}
match self.snowballing_step {
SnowballingStep::Forward => {
// TODO: Implement
}
SnowballingStep::Backward => {
for publication in &self.included_publications {
for reference in &publication.referenced_works {
let api_link = format!(
"https://api.openalex.org/{}",
&reference[21..]
);
// https://openalex.org/W2085881930
let publ = get_publication_by_id(
&api_link,
"an.tsouchlos@gmail.com",
)
.await;
match publ {
Ok(publ) => {
self.pending_publications.push(publ)
}
// TODO: Show error somehow
Err(err) => {
warn!(
"Failed to get publication\
metadata using OpenAlex API: \
{}",
err
);
}
}
}
}
}
}
}
KeyCode::Enter => match self.active_pane {
ActivePane::IncludedPublications => {
if let Some(idx) = self.included_list_state.selected() {

View File

@ -1,5 +1,6 @@
use std::{error::Error, io, time::Duration};
use log::{warn, error};
use ratatui::{
Terminal,
backend::{Backend, CrosstermBackend},
@ -15,7 +16,7 @@ use ratatui::{
use crate::{app::App, ui};
pub fn run(app: App) -> Result<App, Box<dyn Error>> {
pub async fn run(app: App) -> Result<App, Box<dyn Error>> {
// setup terminal
enable_raw_mode()?;
let mut stdout = io::stdout();
@ -24,7 +25,7 @@ pub fn run(app: App) -> Result<App, Box<dyn Error>> {
let mut terminal = Terminal::new(backend)?;
// create app and run it
let app_result = run_app(&mut terminal, app);
let app_result = run_app(&mut terminal, app).await;
// restore terminal
disable_raw_mode()?;
@ -36,13 +37,14 @@ pub fn run(app: App) -> Result<App, Box<dyn Error>> {
terminal.show_cursor()?;
if let Err(err) = &app_result {
error!("{err:?}");
println!("{err:?}");
}
Ok(app_result?)
}
fn run_app<B: Backend>(
async fn run_app<B: Backend>(
terminal: &mut Terminal<B>,
mut app: App,
) -> io::Result<App>
@ -52,11 +54,11 @@ where
loop {
terminal.draw(|frame| ui::draw(frame, &mut app))?;
if event::poll(Duration::from_millis(10))? {
if let Event::Key(key) = event::read()? {
app.handle_key(key.code);
}
// if event::poll(Duration::from_millis(10))? {
if let Event::Key(key) = event::read()? {
app.handle_key(key.code).await;
}
// }
if app.should_quit {
return Ok(app);

View File

@ -54,7 +54,7 @@ fn serialize_savefile(
use clap::Parser;
mod crossterm;
use std::error::Error;
use std::{env, error::Error, fs::OpenOptions};
#[derive(Parser)]
#[command(name = "Brittling")]
@ -62,13 +62,33 @@ use std::error::Error;
// Command-line arguments of the application, parsed by clap.
//
// NOTE: plain `//` comments are used on purpose; clap's derive macro turns
// `///` doc comments into help text, which would change the CLI output.
struct Args {
// Path of the savefile: the application state is deserialized from it on
// startup and serialized back to it after the UI has finished.
#[arg(short, long)]
savefile: String,
// Path of the log file; it is created if missing and opened in append mode
// as the target of the logger. Defaults to "/tmp/snowballing.log".
#[arg(short, long, default_value = "/tmp/snowballing.log")]
logfile: String,
}
fn main() -> Result<(), Box<dyn Error>> {
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
let args = Args::parse();
if env::var("RUST_LOG").is_err() {
unsafe { env::set_var("RUST_LOG", "info") }
}
env_logger::Builder::from_default_env()
.format_module_path(false)
.target(env_logger::Target::Pipe(Box::new(
OpenOptions::new()
.create(true)
.append(true)
.open(args.logfile)
.unwrap(),
)))
.init();
let starting_app_state = deserialize_savefile(&args.savefile)?;
let final_app_state = crate::crossterm::run(starting_app_state)?;
let final_app_state = crate::crossterm::run(starting_app_state).await?;
serialize_savefile(&final_app_state, &args.savefile)?;

View File

@ -1,7 +1,9 @@
use ammonia::Builder;
use html_escape::decode_html_entities;
use reqwest::Error;
use serde::{Deserialize, Serialize};
use std::{collections::HashMap};
use std::collections::HashMap;
use unicode_general_category::{GeneralCategory, get_general_category};
#[derive(Serialize, Deserialize, Debug)]
pub struct Authorship {
@ -55,15 +57,31 @@ impl Publication {
words_with_pos.sort_by_key(|k| k.0);
let unsanitized = words_with_pos
let raw_text = words_with_pos
.into_iter()
.map(|(_, word)| word.as_str())
.collect::<Vec<_>>()
.join(" ");
let cleaner = Builder::empty();
let sanitized = cleaner.clean(&unsanitized).to_string();
sanitized.replace("\u{a0}", " ").trim().to_string()
let cleaner = Builder::empty().clean(&raw_text).to_string();
let decoded = decode_html_entities(&cleaner);
let cleaned: String = decoded
.chars()
.filter(|&c| {
let cat = get_general_category(c);
!matches!(
cat,
GeneralCategory::Control
| GeneralCategory::Format
| GeneralCategory::Surrogate
| GeneralCategory::PrivateUse
| GeneralCategory::Unassigned
) || c.is_whitespace()
})
.collect();
cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
})
}
}
@ -73,6 +91,24 @@ pub struct OpenAlexResponse {
pub results: Vec<Publication>,
}
/// Fetches a single publication record from the OpenAlex API.
///
/// `api_link` is the full API URL of the work, with or without an existing
/// query string. `email` is sent as the `mailto` query parameter.
///
/// # Errors
///
/// Returns a [`reqwest::Error`] if the request cannot be sent, if the server
/// answers with a non-success HTTP status, or if the response body cannot be
/// deserialized into a [`Publication`].
pub async fn get_publication_by_id(
    api_link: &str,
    email: &str,
) -> Result<Publication, Error> {
    // The first query parameter must be introduced with `?`; only subsequent
    // ones use `&`. Links such as `https://api.openalex.org/W123` carry no
    // query string, so blindly appending `&mailto=` would corrupt the path.
    let separator = if api_link.contains('?') { '&' } else { '?' };
    let url = format!("{api_link}{separator}mailto={email}");

    reqwest::Client::new()
        .get(url)
        .header("User-Agent", "Rust-OpenAlex-Client/1.0")
        .send()
        .await?
        // Report HTTP failures (404, 429, ...) as status errors instead of
        // letting them surface as a confusing JSON decoding error.
        .error_for_status()?
        .json::<Publication>()
        .await
}
// TODO: Get all papers, not just the first page
pub async fn get_citing_papers(
target_id: &str,