brittling/src/snowballing.rs

97 lines
2.6 KiB
Rust

use ammonia::Builder;
use reqwest::Error;
use serde::{Deserialize, Serialize};
use std::{collections::HashMap};
/// One entry in a [`Publication`]'s `authorships` list.
///
/// Deserialized as part of the JSON that `get_citing_papers` requests from
/// the OpenAlex `/works` endpoint.
#[derive(Serialize, Deserialize, Debug)]
pub struct Authorship {
/// Position label of this author within the author list.
/// NOTE(review): presumably OpenAlex's "first" / "middle" / "last" values —
/// confirm against the API documentation.
pub author_position: String,
/// Author name as delivered by the API; `Publication::get_author_text`
/// uses the first entry's value as the display name.
pub raw_author_name: String,
}
// TODO: Handle duplicates by having vectors of ids
/// A single work as it appears in the `results` array of an
/// [`OpenAlexResponse`].
///
/// `authorships` and `referenced_works` carry no serde default, so they must
/// be present in the JSON; the `Option` fields may be absent or `null`.
#[derive(Serialize, Deserialize, Debug)]
pub struct Publication {
/// Identifier of the work.
/// NOTE(review): presumably the OpenAlex work ID/URL — confirm.
pub id: String,
/// Title of the work, if any; exposed through `get_title`.
pub display_name: Option<String>,
/// Author entries; `get_author_text` uses the first one for its label.
pub authorships: Vec<Authorship>,
/// Year of publication, if any; exposed through `get_year`.
pub publication_year: Option<u32>,
/// Abstract stored as an inverted index: each word maps to every position
/// at which it occurs. `get_abstract` turns this back into running text.
pub abstract_inverted_index: Option<HashMap<String, Vec<u32>>>,
/// NOTE(review): presumably the identifiers of the works this publication
/// cites — confirm against the API documentation.
pub referenced_works: Vec<String>,
}
impl Publication {
    /// Returns an owned copy of the title, or `None` when the work has none.
    pub fn get_title(&self) -> Option<String> {
        self.display_name.clone()
    }

    /// Returns the publication year, or `None` when it is unknown.
    pub fn get_year(&self) -> Option<u32> {
        self.publication_year
    }

    /// Builds a short author label for display.
    ///
    /// * no authors: `"Unknown author"`
    /// * one author: that author's `raw_author_name`
    /// * several authors: the first author's name followed by `" et al."`
    ///
    /// The data comes from an external API, so an empty author list is
    /// treated as a displayable condition instead of a panic.
    pub fn get_author_text(&self) -> String {
        match self.authorships.as_slice() {
            [] => String::from("Unknown author"),
            [only] => only.raw_author_name.clone(),
            [first, ..] => format!("{} et al.", first.raw_author_name),
        }
    }

    /// Reconstructs the abstract text from `abstract_inverted_index`.
    ///
    /// Every `(position, word)` pair of the index is ordered by position and
    /// the words are joined with single spaces. The result is passed through
    /// an `ammonia` cleaner built with `Builder::empty()` to remove markup,
    /// non-breaking spaces are replaced by ordinary spaces, and surrounding
    /// whitespace is trimmed.
    ///
    /// Returns `None` when the publication has no inverted index.
    pub fn get_abstract(&self) -> Option<String> {
        // Borrow the index; cloning the whole map is not needed to read it.
        let index = self.abstract_inverted_index.as_ref()?;

        let mut placed: Vec<(u32, &str)> = index
            .iter()
            .flat_map(|(word, positions)| {
                positions.iter().map(move |&pos| (pos, word.as_str()))
            })
            .collect();
        // Ordering by (position, word) keeps the output deterministic even if
        // two words claim the same position; hash-map iteration order is not.
        placed.sort_unstable();

        let unsanitized = placed
            .into_iter()
            .map(|(_, word)| word)
            .collect::<Vec<_>>()
            .join(" ");

        let sanitized = Builder::empty().clean(&unsanitized).to_string();
        Some(sanitized.replace("\u{a0}", " ").trim().to_string())
    }
}
/// Shape of the JSON body that `get_citing_papers` deserializes.
///
/// Only the `results` array is modelled here.
#[derive(Serialize, Deserialize, Debug)]
pub struct OpenAlexResponse {
/// Works contained in the response.
pub results: Vec<Publication>,
}
// TODO: Get all papers, not just the first page
/// Fetches works that cite `target_id` from the OpenAlex `/works` endpoint.
///
/// Sends a single request, so only the results contained in that one
/// response are returned.
///
/// * `target_id` – identifier placed after `cites:` in the `filter` parameter.
/// * `email` – contact address sent as the `mailto` parameter.
///
/// Both values are URL-encoded, so characters such as `+` or `&` in an
/// e-mail address cannot corrupt the query string.
///
/// # Errors
///
/// Returns a [`reqwest::Error`] when the request cannot be sent, when the
/// server answers with a 4xx/5xx status, or when the body cannot be
/// deserialized into an [`OpenAlexResponse`].
pub async fn get_citing_papers(
    target_id: &str,
    email: &str,
) -> Result<Vec<Publication>, Error> {
    let filter = format!("cites:{}", target_id);
    let url = reqwest::Url::parse_with_params(
        "https://api.openalex.org/works",
        &[("filter", filter.as_str()), ("mailto", email)],
    )
    .expect("the constant OpenAlex base URL is always a valid URL");

    let response = reqwest::Client::new()
        .get(url)
        .header("User-Agent", "Rust-OpenAlex-Client/1.0")
        .send()
        .await?
        // Surface HTTP failures as status errors instead of letting them show
        // up later as a misleading JSON decoding error.
        .error_for_status()?
        .json::<OpenAlexResponse>()
        .await?;

    Ok(response.results)
}