Implement fetching references with future streams

This commit is contained in:
Andreas Tsouchlos 2026-01-01 05:33:32 +02:00
parent 0ba4e21ae3
commit 143e177b8c
6 changed files with 265 additions and 53 deletions

51
Cargo.lock generated
View File

@ -158,6 +158,7 @@ dependencies = [
"clap",
"crossterm",
"env_logger",
"futures",
"html-escape",
"log",
"open",
@ -663,6 +664,21 @@ dependencies = [
"new_debug_unreachable",
]
[[package]]
name = "futures"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.31"
@ -670,6 +686,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
@ -678,6 +695,34 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "futures-executor"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
[[package]]
name = "futures-macro"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.112",
]
[[package]]
name = "futures-sink"
version = "0.3.31"
@ -696,10 +741,16 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"slab",
]
[[package]]

View File

@ -23,3 +23,4 @@ textwrap = "0.16.2"
tokio = { version = "1.48.0", features = ["full"] }
unicode-general-category = "1.1.0"
brittling_macros = { path = "macros" }
futures = "0.3.31"

View File

@ -3,6 +3,7 @@ pub mod run;
pub mod seeding;
pub mod snowballing;
use futures::StreamExt;
use log::{error, info, warn};
use ratatui::crossterm::event::KeyCode;
use serde::{Deserialize, Serialize};
@ -14,7 +15,10 @@ use tokio::{
use crate::{
app::run::Action,
literature::{Publication, SnowballingHistory, get_publication_by_id},
literature::{
Publication, SnowballingHistory, get_publication_by_id,
get_references_stream,
},
status_error, status_info,
};
use brittling_macros::component;
@ -134,9 +138,16 @@ impl App {
.push(publ.clone());
}
// TODO: Is deduplication necessary here?
#[action]
fn fetch_pub(
/// Appends `publ` to the history's list of pending publications.
///
/// The publication is taken by value and moved into the list; no
/// deduplication is performed here.
fn add_pending_pub(&mut self, publ: Publication) {
    // `publ` is already owned, so it can be moved instead of cloned.
    self.state.history.pending_publications.push(publ);
}
#[action]
fn fetch_and_include_seed(
&self,
link: String,
action_tx: &'static UnboundedSender<Action>,
) -> Result<(), SendError<Action>> {
if !self
@ -160,15 +171,15 @@ impl App {
let api_link = format!(
"https://api.openalex.org/{}",
self.state
.seeding
.input
.trim_start_matches("https://openalex.org/")
link.trim_start_matches("https://openalex.org/")
);
tokio::spawn(async move {
let publ =
get_publication_by_id(&api_link, "an.tsouchlos@gmail.com");
let publ = get_publication_by_id(
api_link.into(),
"an.tsouchlos@gmail.com".to_string().into(),
None,
);
match publ.await {
Ok(publ) => {
@ -191,13 +202,42 @@ impl App {
action_tx.send(SeedingAction::ClearInput.into())
}
// TODO: Implement
#[action]
fn fetch(
fn fetch_references(
&self,
action_tx: &'static UnboundedSender<Action>,
) -> Result<(), SendError<Action>> {
status_info!(action_tx, "Fetch action triggered")
status_info!(action_tx, "Fetching references...")?;
let included = self.state.history.get_all_included();
tokio::spawn(async move {
let mut stream = get_references_stream(
included,
"an.tsouchlos@gmail.com".to_string(),
);
while let Some(result) = stream.next().await {
match result {
Ok(vals) => {
for val in vals {
let _ = action_tx
.send(GlobalAction::AddPendingPub(val).into());
}
}
Err(err) => {
let _ = status_error!(
action_tx,
"Error loading reference: {}",
err
);
}
}
}
let _ = status_info!(action_tx, "Done fetching references");
});
Ok(())
}
pub fn handle_key(
@ -214,9 +254,12 @@ impl App {
(Tab::Seeding, KeyCode::Backspace) => {
action_tx.send(SeedingAction::EnterBackspace.into())
}
(Tab::Seeding, KeyCode::Enter) => {
action_tx.send(GlobalAction::FetchPub.into())
}
(Tab::Seeding, KeyCode::Enter) => action_tx.send(
GlobalAction::FetchAndIncludeSeed(
self.state.seeding.input.clone(),
)
.into(),
),
(Tab::Snowballing, KeyCode::Enter) => {
action_tx.send(SnowballingAction::Search.into())
}
@ -233,7 +276,7 @@ impl App {
action_tx.send(SnowballingAction::PrevItem.into())
}
(Tab::Snowballing, KeyCode::Char(' ')) => {
action_tx.send(GlobalAction::Fetch.into())
action_tx.send(GlobalAction::FetchReferences.into())
}
_ => Ok(()),
}

View File

@ -28,7 +28,7 @@ macro_rules! status_warn {
($action_tx:expr, $text:expr $(, $args:expr)*) => {
$action_tx.send(
crate::app::GlobalAction::ShowStatMsg(
crate::app::StatusMessage::Info(format!($text, $($args)*)))
crate::app::StatusMessage::Warn(format!($text, $($args)*)))
.into(),
)
};
@ -40,7 +40,7 @@ macro_rules! status_error {
($action_tx:expr, $text:expr $(, $args:expr)*) => {
$action_tx.send(
crate::app::GlobalAction::ShowStatMsg(
crate::app::StatusMessage::Info(format!($text, $($args)*)))
crate::app::StatusMessage::Error(format!($text $(, $args)*)))
.into(),
)
};

View File

@ -1,6 +1,10 @@
use serde::{Deserialize, Serialize};
use tokio::sync::mpsc::{UnboundedSender, error::SendError};
use crate::literature::Publication;
use crate::{
app::{GlobalAction, run::Action},
literature::Publication,
};
use brittling_macros::component;
#[derive(Serialize, Deserialize, Default)]
@ -12,9 +16,18 @@ pub struct SeedingComponent {
#[component(SeedingAction)]
impl SeedingComponent {
#[action]
/// Requests that the current input be fetched and included as a seed.
///
/// Sends a `GlobalAction::FetchAndIncludeSeed` carrying a copy of
/// `self.input` over `action_tx` and returns the result of that send.
pub fn submit(
    &mut self,
    action_tx: &UnboundedSender<Action>,
) -> Result<(), SendError<Action>> {
    let seed_link = self.input.clone();
    let request: Action = GlobalAction::FetchAndIncludeSeed(seed_link).into();
    action_tx.send(request)
}
#[action]
pub fn clear_input(&mut self) {
self.input.clear()
self.input.clear();
}
#[action]

View File

@ -1,8 +1,9 @@
use ammonia::Builder;
use futures::{StreamExt, future::BoxFuture, stream};
use html_escape::decode_html_entities;
use reqwest::Error;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::{collections::HashMap, fmt::Display, sync::Arc};
use tokio::task::JoinSet;
use unicode_general_category::{GeneralCategory, get_general_category};
#[derive(Serialize, Deserialize, Debug, Clone)]
@ -72,9 +73,35 @@ impl SnowballingHistory {
}
}
/// Turns raw API text into a single clean line of display text.
///
/// The steps are, in order:
/// 1. run the text through an empty `ammonia::Builder` and decode HTML
///    entities,
/// 2. drop control, format, surrogate, private-use and unassigned
///    characters (whitespace is kept so it can act as a separator),
/// 3. replace literal backslash-`n` sequences with a space,
/// 4. collapse every run of whitespace into one space and trim the ends.
fn sanitize_text(raw_text: &str) -> String {
    let stripped = Builder::empty().clean(raw_text).to_string();
    let decoded = decode_html_entities(&stripped);

    let printable: String = decoded
        .chars()
        .filter(|&c| {
            let cat = get_general_category(c);

            // Whitespace is exempt from the category filter: characters such
            // as '\n' are `Control` but still separate words.
            c.is_whitespace()
                || !matches!(
                    cat,
                    GeneralCategory::Control
                        | GeneralCategory::Format
                        | GeneralCategory::Surrogate
                        | GeneralCategory::PrivateUse
                        | GeneralCategory::Unassigned
                )
        })
        .collect();

    // Replace escaped newlines *before* collapsing whitespace; doing it
    // afterwards would reintroduce double or trailing spaces.
    printable
        .replace("\\n", " ")
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
}
impl Publication {
pub fn get_title(&self) -> Option<String> {
self.display_name.clone()
self.display_name.clone().map(|s| sanitize_text(&s))
}
pub fn get_year(&self) -> Option<u32> {
@ -113,50 +140,127 @@ impl Publication {
.collect::<Vec<_>>()
.join(" ");
let cleaner = Builder::empty().clean(&raw_text).to_string();
let decoded = decode_html_entities(&cleaner);
let cleaned: String = decoded
.chars()
.filter(|&c| {
let cat = get_general_category(c);
!matches!(
cat,
GeneralCategory::Control
| GeneralCategory::Format
| GeneralCategory::Surrogate
| GeneralCategory::PrivateUse
| GeneralCategory::Unassigned
) || c.is_whitespace()
})
.collect();
cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
sanitize_text(&raw_text)
})
}
}
// #[derive(Serialize, Deserialize, Debug)]
// pub struct OpenAlexResponse {
// pub results: Vec<Publication>,
// }
/// Deserialization target for a works-list query response.
///
/// Only the `results` array is modeled; any other fields of the response
/// body are not represented here.
#[derive(Serialize, Deserialize, Debug)]
pub struct OpenAlexResponse {
/// The publications contained in the response.
pub results: Vec<Publication>,
}
/// Errors produced while querying the OpenAlex API.
#[derive(Debug)]
pub enum Error {
    /// A request could not be sent or its response could not be decoded.
    /// The payload is the URL that was requested.
    ApiError(String),
}

impl Display for Error {
    /// Writes a human-readable message that includes the failing URL.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::ApiError(url) => write!(
                f,
                "An error occurred while attempting to access the OpenAlex \
                 API: url={}",
                url
            ),
        }
    }
}

// Makes the type usable as `Box<dyn std::error::Error>` and with error
// handling utilities that require the standard trait.
impl std::error::Error for Error {}
pub async fn get_publication_by_id(
api_link: &str,
email: &str,
api_link: Arc<String>,
email: Arc<String>,
client: Option<reqwest::Client>,
) -> Result<Publication, Error> {
let url = format!("{}&mailto={}", api_link, email);
let client = reqwest::Client::new();
let response = client
.get(url)
let client = client.or(Some(reqwest::Client::new())).unwrap();
client
.get(url.clone())
.header("User-Agent", "Rust-OpenAlex-Client/1.0")
.send()
.await?
.await
.map_err(|_| Error::ApiError(url.clone()))?
.json::<Publication>()
.await?;
.await
.map_err(|_| Error::ApiError(url))
}
Ok(response)
// TODO: Get all publications, not just the first 25
pub fn get_references_stream(
publications: Vec<Publication>,
email: String,
) -> impl futures::Stream<Item = Result<Vec<Publication>, Error>> {
let email = Arc::new(email);
let client = reqwest::Client::new();
let mut publication_ids = Vec::<String>::new();
let mut referenced_work_urls = Vec::<String>::new();
for p in &publications {
publication_ids
.push(p.id.trim_start_matches("https://openalex.org/").to_string());
referenced_work_urls.append(
&mut p
.referenced_works
.iter()
.map(|r| {
format!(
"https://api.openalex.org/{}",
r.trim_start_matches("https://openalex.org/")
)
})
.collect::<Vec<String>>(),
);
}
// Get references using the referenced_works field
let stream1 = stream::iter(referenced_work_urls).map({
let (email, client) = (email.clone(), client.clone());
move |url| {
let (email, client) = (email.clone(), client.clone());
let fut: BoxFuture<'static, Result<Vec<Publication>, Error>> =
Box::pin(async move {
get_publication_by_id(url.into(), email, Some(client))
.await
.map(|val| vec![val])
});
fut
}
});
// Search for references using API calls
let stream2 = stream::iter(publication_ids).map(move |id| {
let url = format!(
"https://api.openalex.org/works?filter=cites:{}&mailto={}",
id, email
);
let client = client.clone();
let fut: BoxFuture<'static, Result<Vec<Publication>, Error>> =
Box::pin(async move {
client
.get(url.clone())
.header("User-Agent", "Rust-OpenAlex-Client/1.0")
.send()
.await
.map_err(|_| Error::ApiError(url.clone()))?
.json::<OpenAlexResponse>()
.await
.map(|response| response.results)
.map_err(|_| Error::ApiError(url.clone()))
});
fut
});
// Combine the two streams
stream::select(stream1, stream2).buffer_unordered(10)
}
// // TODO: Get all papers, not just the first page