Start implementing snowballing functionality; add logging; properly sanitize abstracts

This commit is contained in:
2025-12-30 01:50:49 +02:00
parent 14f503a554
commit 4901a2897f
6 changed files with 230 additions and 21 deletions

View File

@@ -1,7 +1,11 @@
use core::panic;
use ratatui::{crossterm::event::KeyCode, widgets::ListState};
use serde::{Deserialize, Serialize};
use crate::snowballing::Publication;
use crate::snowballing::{Publication, get_publication_by_id};
use log::{debug, info, warn, error};
#[derive(Serialize, Deserialize, Default, PartialEq)]
pub enum ActivePane {
@@ -20,8 +24,8 @@ pub enum ActiveTab {
#[derive(Serialize, Deserialize, Default)]
pub enum SnowballingStep {
#[default]
Forward,
Backward,
Forward,
}
impl ToString for SnowballingStep {
@@ -104,14 +108,29 @@ pub struct App {
pub should_quit: bool,
}
// TODO: Implement exclusion and inclusion of papers (e.g., X and Y chars)
// TODO: Implement moving through steps and iterations (populating pending papers)
// TODO: Implement input of seed papers using IDs
// TODO: Implement exclusion and inclusion of papers (e.g., X and Y chars)
// TODO: Implement possibility of pushing excluded papers back into pending
// TODO: Implement export of included papers as csv for keywording with a spreadsheet
// TODO: Implement export of included papers into zotero (Use RIS format somehow)
impl App {
pub fn handle_key(&mut self, key: KeyCode) {
// TODO: Show error somehow
/// Fetches the publication behind `api_link` and adds it to the included
/// publications.
///
/// The lookup is delegated to `get_publication_by_id`. On success the
/// returned publication is appended to `self.included_publications`; on
/// failure a warning is logged and the application state is left untouched.
///
/// Takes `&str` rather than `&String` so callers can pass string slices as
/// well as owned strings (a `&String` argument still coerces).
pub async fn add_seed_paper(&mut self, api_link: &str) {
    match get_publication_by_id(api_link, "an.tsouchlos@gmail.com").await {
        Ok(publication) => self.included_publications.push(publication),
        Err(err) => {
            warn!(
                "Failed to get publication metadata using OpenAlex API: {}",
                err
            );
        }
    }
}
pub async fn handle_key(&mut self, key: KeyCode) {
if KeyCode::Esc == key {
self.should_quit = true;
return;
@@ -123,7 +142,7 @@ impl App {
self.active_tab = ActiveTab::Snowballing;
}
KeyCode::Enter => {
// TODO: Actually add paper to included list
self.add_seed_paper(&self.seeding_input.clone()).await;
self.seeding_input.clear();
}
KeyCode::Char(to_insert) => self.seeding_input.push(to_insert),
@@ -136,6 +155,53 @@ impl App {
_ => {}
},
ActiveTab::Snowballing => match key {
// Space starts the next snowballing step. It is refused while there are
// still pending publications left to screen.
KeyCode::Char(' ') => {
    if !self.pending_publications.is_empty() {
        warn!(
            "The next snowballing step can only be initiated \
             after screening all pending publications"
        );
        // TODO: Show warning/error somehow
        return;
    }

    match self.snowballing_step {
        SnowballingStep::Forward => {
            // TODO: Implement
        }
        SnowballingStep::Backward => {
            // Referenced works are stored as full OpenAlex IDs, e.g.
            // https://openalex.org/W2085881930; only the trailing work ID
            // is needed to build the API link.
            const OPENALEX_ID_PREFIX: &str = "https://openalex.org/";

            for publication in &self.included_publications {
                for reference in &publication.referenced_works {
                    // Strip the prefix explicitly instead of slicing at a
                    // fixed byte offset, which panics on short or
                    // unexpectedly formatted IDs.
                    let Some(work_id) =
                        reference.strip_prefix(OPENALEX_ID_PREFIX)
                    else {
                        warn!(
                            "Skipping reference with unexpected OpenAlex ID: {}",
                            reference
                        );
                        continue;
                    };

                    let api_link =
                        format!("https://api.openalex.org/{}", work_id);

                    match get_publication_by_id(
                        &api_link,
                        "an.tsouchlos@gmail.com",
                    )
                    .await
                    {
                        Ok(publ) => self.pending_publications.push(publ),
                        // TODO: Show error somehow
                        Err(err) => {
                            warn!(
                                "Failed to get publication metadata \
                                 using OpenAlex API: {}",
                                err
                            );
                        }
                    }
                }
            }
        }
    }
}
KeyCode::Enter => match self.active_pane {
ActivePane::IncludedPublications => {
if let Some(idx) = self.included_list_state.selected() {

View File

@@ -1,5 +1,6 @@
use std::{error::Error, io, time::Duration};
use log::{warn, error};
use ratatui::{
Terminal,
backend::{Backend, CrosstermBackend},
@@ -15,7 +16,7 @@ use ratatui::{
use crate::{app::App, ui};
pub fn run(app: App) -> Result<App, Box<dyn Error>> {
pub async fn run(app: App) -> Result<App, Box<dyn Error>> {
// setup terminal
enable_raw_mode()?;
let mut stdout = io::stdout();
@@ -24,7 +25,7 @@ pub fn run(app: App) -> Result<App, Box<dyn Error>> {
let mut terminal = Terminal::new(backend)?;
// create app and run it
let app_result = run_app(&mut terminal, app);
let app_result = run_app(&mut terminal, app).await;
// restore terminal
disable_raw_mode()?;
@@ -36,13 +37,14 @@ pub fn run(app: App) -> Result<App, Box<dyn Error>> {
terminal.show_cursor()?;
if let Err(err) = &app_result {
error!("{err:?}");
println!("{err:?}");
}
Ok(app_result?)
}
fn run_app<B: Backend>(
async fn run_app<B: Backend>(
terminal: &mut Terminal<B>,
mut app: App,
) -> io::Result<App>
@@ -52,11 +54,11 @@ where
loop {
terminal.draw(|frame| ui::draw(frame, &mut app))?;
if event::poll(Duration::from_millis(10))? {
if let Event::Key(key) = event::read()? {
app.handle_key(key.code);
}
// if event::poll(Duration::from_millis(10))? {
if let Event::Key(key) = event::read()? {
app.handle_key(key.code).await;
}
// }
if app.should_quit {
return Ok(app);

View File

@@ -54,7 +54,7 @@ fn serialize_savefile(
use clap::Parser;
mod crossterm;
use std::error::Error;
use std::{env, error::Error, fs::OpenOptions};
#[derive(Parser)]
#[command(name = "Brittling")]
@@ -62,13 +62,33 @@ use std::error::Error;
// Command-line arguments of the application.
// NOTE: plain `//` comments are used on purpose; `///` doc comments on the
// fields would be picked up by clap as help text.
struct Args {
// Path of the savefile. `main` passes it to `deserialize_savefile` to obtain
// the starting app state and to `serialize_savefile` with the final state.
#[arg(short, long)]
savefile: String,
// Path of the log file. `main` opens it in create + append mode and uses it
// as the env_logger output target. Defaults to /tmp/snowballing.log.
#[arg(short, long, default_value = "/tmp/snowballing.log")]
logfile: String,
}
fn main() -> Result<(), Box<dyn Error>> {
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
let args = Args::parse();
if env::var("RUST_LOG").is_err() {
unsafe { env::set_var("RUST_LOG", "info") }
}
env_logger::Builder::from_default_env()
.format_module_path(false)
.target(env_logger::Target::Pipe(Box::new(
OpenOptions::new()
.create(true)
.append(true)
.open(args.logfile)
.unwrap(),
)))
.init();
let starting_app_state = deserialize_savefile(&args.savefile)?;
let final_app_state = crate::crossterm::run(starting_app_state)?;
let final_app_state = crate::crossterm::run(starting_app_state).await?;
serialize_savefile(&final_app_state, &args.savefile)?;

View File

@@ -1,7 +1,9 @@
use ammonia::Builder;
use html_escape::decode_html_entities;
use reqwest::Error;
use serde::{Deserialize, Serialize};
use std::{collections::HashMap};
use std::collections::HashMap;
use unicode_general_category::{GeneralCategory, get_general_category};
#[derive(Serialize, Deserialize, Debug)]
pub struct Authorship {
@@ -55,15 +57,31 @@ impl Publication {
words_with_pos.sort_by_key(|k| k.0);
let unsanitized = words_with_pos
let raw_text = words_with_pos
.into_iter()
.map(|(_, word)| word.as_str())
.collect::<Vec<_>>()
.join(" ");
let cleaner = Builder::empty();
let sanitized = cleaner.clean(&unsanitized).to_string();
sanitized.replace("\u{a0}", " ").trim().to_string()
let cleaner = Builder::empty().clean(&raw_text).to_string();
let decoded = decode_html_entities(&cleaner);
let cleaned: String = decoded
.chars()
.filter(|&c| {
let cat = get_general_category(c);
!matches!(
cat,
GeneralCategory::Control
| GeneralCategory::Format
| GeneralCategory::Surrogate
| GeneralCategory::PrivateUse
| GeneralCategory::Unassigned
) || c.is_whitespace()
})
.collect();
cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
})
}
}
@@ -73,6 +91,24 @@ pub struct OpenAlexResponse {
pub results: Vec<Publication>,
}
/// Fetches a single publication from the given OpenAlex API link.
///
/// `email` is sent along as the `mailto` query parameter. The parameter is
/// attached with `?` when `api_link` has no query string yet and with `&`
/// when it already has one, so both plain links
/// (`https://api.openalex.org/W123`) and links that already carry
/// parameters produce a well-formed URL.
///
/// # Errors
///
/// Returns a `reqwest::Error` if the request cannot be sent, if the server
/// answers with a 4xx/5xx status, or if the response body cannot be
/// deserialized into a `Publication`.
pub async fn get_publication_by_id(
    api_link: &str,
    email: &str,
) -> Result<Publication, Error> {
    // Pick the separator that keeps the query string valid.
    let separator = if api_link.ends_with('?') || api_link.ends_with('&') {
        ""
    } else if api_link.contains('?') {
        "&"
    } else {
        "?"
    };
    let url = format!("{api_link}{separator}mailto={email}");

    reqwest::Client::new()
        .get(url)
        .header("User-Agent", "Rust-OpenAlex-Client/1.0")
        .send()
        .await?
        // Surface HTTP errors as such instead of as a confusing JSON
        // decoding failure of the error body.
        .error_for_status()?
        .json::<Publication>()
        .await
}
// TODO: Get all papers, not just the first page
pub async fn get_citing_papers(
target_id: &str,