183 lines
5.1 KiB
Rust
183 lines
5.1 KiB
Rust
use ammonia::Builder;
|
|
use html_escape::decode_html_entities;
|
|
use reqwest::Error;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::collections::HashMap;
|
|
use unicode_general_category::{GeneralCategory, get_general_category};
|
|
|
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
|
pub struct Authorship {
|
|
pub author_position: String,
|
|
pub raw_author_name: String,
|
|
}
|
|
|
|
// TODO: Handle duplicates by having vectors of ids
|
|
#[derive(Serialize, Deserialize, Debug, Clone)]
|
|
pub struct Publication {
|
|
pub id: String,
|
|
pub display_name: Option<String>,
|
|
pub authorships: Vec<Authorship>,
|
|
pub publication_year: Option<u32>,
|
|
pub abstract_inverted_index: Option<HashMap<String, Vec<u32>>>,
|
|
pub referenced_works: Vec<String>,
|
|
}
|
|
|
|
#[derive(Serialize, Deserialize, Default)]
|
|
pub enum SnowballingStep {
|
|
#[default]
|
|
Backward,
|
|
Forward,
|
|
}
|
|
|
|
impl ToString for SnowballingStep {
|
|
fn to_string(&self) -> String {
|
|
match self {
|
|
SnowballingStep::Forward => String::from("forward"),
|
|
SnowballingStep::Backward => String::from("backward"),
|
|
}
|
|
}
|
|
}
|
|
|
|
// TODO: Only store IDs of excluded publications?
|
|
#[derive(Serialize, Deserialize, Default)]
|
|
pub struct SnowballingIteration {
|
|
pub included_publications: Vec<Publication>,
|
|
pub excluded_publications: Vec<Publication>,
|
|
pub step: SnowballingStep,
|
|
}
|
|
|
|
#[derive(Serialize, Deserialize, Default)]
|
|
pub struct SnowballingHistory {
|
|
pub seed: Vec<Publication>,
|
|
pub current_iteration: SnowballingIteration,
|
|
pub previoius_iterations: Vec<SnowballingIteration>,
|
|
pub pending_publications: Vec<Publication>,
|
|
}
|
|
|
|
impl SnowballingHistory {
|
|
pub fn get_all_included(&self) -> Vec<Publication> {
|
|
vec![self.current_iteration.included_publications.clone()]
|
|
.into_iter()
|
|
.chain(
|
|
self.previoius_iterations
|
|
.iter()
|
|
.map(|iter| iter.included_publications.clone()),
|
|
)
|
|
.flatten()
|
|
.collect()
|
|
}
|
|
|
|
pub fn get_all_pending(&self) -> Vec<Publication> {
|
|
self.pending_publications.clone()
|
|
}
|
|
}
|
|
|
|
impl Publication {
|
|
pub fn get_title(&self) -> Option<String> {
|
|
self.display_name.clone()
|
|
}
|
|
|
|
pub fn get_year(&self) -> Option<u32> {
|
|
self.publication_year
|
|
}
|
|
|
|
pub fn get_author_text(&self) -> String {
|
|
let mut author_str = self
|
|
.authorships
|
|
.first()
|
|
.map(|authorship| authorship.raw_author_name.clone())
|
|
.expect("Papers are required to always have at least one author");
|
|
|
|
if self.authorships.len() > 1 {
|
|
author_str.push_str(" et al.");
|
|
}
|
|
|
|
author_str
|
|
}
|
|
|
|
pub fn get_abstract(&self) -> Option<String> {
|
|
self.abstract_inverted_index.clone().map(|content| {
|
|
let mut words_with_pos: Vec<(u32, &String)> = Vec::new();
|
|
|
|
for (word, positions) in &content {
|
|
for pos in positions {
|
|
words_with_pos.push((*pos, word));
|
|
}
|
|
}
|
|
|
|
words_with_pos.sort_by_key(|k| k.0);
|
|
|
|
let raw_text = words_with_pos
|
|
.into_iter()
|
|
.map(|(_, word)| word.as_str())
|
|
.collect::<Vec<_>>()
|
|
.join(" ");
|
|
|
|
let cleaner = Builder::empty().clean(&raw_text).to_string();
|
|
let decoded = decode_html_entities(&cleaner);
|
|
|
|
let cleaned: String = decoded
|
|
.chars()
|
|
.filter(|&c| {
|
|
let cat = get_general_category(c);
|
|
!matches!(
|
|
cat,
|
|
GeneralCategory::Control
|
|
| GeneralCategory::Format
|
|
| GeneralCategory::Surrogate
|
|
| GeneralCategory::PrivateUse
|
|
| GeneralCategory::Unassigned
|
|
) || c.is_whitespace()
|
|
})
|
|
.collect();
|
|
|
|
cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
|
|
})
|
|
}
|
|
}
|
|
|
|
// #[derive(Serialize, Deserialize, Debug)]
|
|
// pub struct OpenAlexResponse {
|
|
// pub results: Vec<Publication>,
|
|
// }
|
|
|
|
pub async fn get_publication_by_id(
|
|
api_link: &str,
|
|
email: &str,
|
|
) -> Result<Publication, Error> {
|
|
let url = format!("{}&mailto={}", api_link, email);
|
|
|
|
let client = reqwest::Client::new();
|
|
let response = client
|
|
.get(url)
|
|
.header("User-Agent", "Rust-OpenAlex-Client/1.0")
|
|
.send()
|
|
.await?
|
|
.json::<Publication>()
|
|
.await?;
|
|
|
|
Ok(response)
|
|
}
|
|
|
|
// // TODO: Get all papers, not just the first page
|
|
// pub async fn get_citing_papers(
|
|
// target_id: &str,
|
|
// email: &str,
|
|
// ) -> Result<Vec<Publication>, Error> {
|
|
// let url = format!(
|
|
// "https://api.openalex.org/works?filter=cites:{}&mailto={}",
|
|
// target_id, email
|
|
// );
|
|
//
|
|
// let client = reqwest::Client::new();
|
|
// let response = client
|
|
// .get(url)
|
|
// .header("User-Agent", "Rust-OpenAlex-Client/1.0")
|
|
// .send()
|
|
// .await?
|
|
// .json::<OpenAlexResponse>()
|
|
// .await?;
|
|
//
|
|
// Ok(response.results)
|
|
// }
|