SHA256
1
0

Initial commit

This commit is contained in:
Alex Wied
2026-06-08 19:33:49 -04:00
commit 6906fb973a
24 changed files with 9382 additions and 0 deletions
+95
View File
@@ -0,0 +1,95 @@
use std::io::{BufReader, Read};
use tantivy::{doc, TantivyDocument};
use xml::EventReader;
use xml::reader::XmlEvent;
use crate::Schema;
/// Converts a Bible XML export into Tantivy documents.
///
/// The type carries no state; the conversion is exposed through the associated
/// function `xml_to_tantivy_documents`.
///
/// # Examples
///
/// ```ignore
/// let file = File::open("./EnglishNKJBible.xml")?;
/// let documents = gl_search::BibleXmlConverter::xml_to_tantivy_documents(9, "https://graphofliberty.org/entity/E0", file);
/// let mut output = File::create("./EnglishNKJBible.jsonl")?;
/// for document in documents {
///     let json = document.to_json(Schema::schema());
///     output.write_all(json.as_bytes())?;
///     output.write_all("\n".as_bytes())?;
/// }
/// ```
pub struct BibleXmlConverter {}
impl BibleXmlConverter {
pub fn xml_to_tantivy_documents<R: Read>(type_: u64, iri: &str, reader: R) -> Vec<TantivyDocument> {
let file = BufReader::new(reader);
let parser = EventReader::new(file);
let mut documents = vec![];
let mut book: Option<u64> = None;
let mut chapter: Option<u64> = None;
let mut verse: Option<u64> = None;
let mut content: Option<String> = None;
for event in parser {
match event {
Ok(XmlEvent::StartElement { name, attributes, .. }) => {
match name.local_name.as_str() {
"book" => {
for attr in attributes {
if attr.name.local_name == "number" {
book = attr.value.parse().ok();
}
}
},
"chapter" => {
for attr in attributes {
if attr.name.local_name == "number" {
chapter = attr.value.parse().ok();
}
}
},
"verse" => {
for attr in attributes {
if attr.name.local_name == "number" {
verse = attr.value.parse().ok();
}
}
}
_ => {}
}
}
Ok(XmlEvent::Characters(characters)) => {
content = Some(characters);
}
Ok(XmlEvent::EndElement { name }) => {
match name.local_name.as_str() {
"book" => {
book = None;
}
"chapter" => {
chapter = None;
}
"verse" => {
if let Some(book) = book && let Some(chapter) = chapter && let Some(verse) = verse && let Some(ref content) = content {
let document = doc!(
Schema::type_field() => type_,
Schema::iri_field() => iri,
Schema::book_field() => book,
Schema::chapter_field() => chapter,
Schema::verse_field() => verse,
Schema::content_field() => content.as_str(),
);
documents.push(document);
}
verse = None;
}
_ => {}
}
}
Err(e) => {
eprintln!("Error: {e}");
break;
}
_ => {}
}
}
documents
}
}
+17
View File
@@ -0,0 +1,17 @@
use thiserror::Error;
/// Result alias whose error type is fixed to [`SearchError`].
pub type Result<R> = std::result::Result<R, SearchError>;
/// Errors returned by this crate's search API.
///
/// The three wrapping variants are `transparent`: their display text and error
/// source are those of the wrapped Tantivy error, and `#[from]` lets `?` convert
/// into them automatically.
#[derive(Debug, Error)]
pub enum SearchError {
/// A general Tantivy error.
#[error(transparent)]
TantivyError(#[from] tantivy::TantivyError),
/// Tantivy's document parsing error.
#[error(transparent)]
DocumentParsingError(#[from] tantivy::schema::DocParsingError),
/// Tantivy's error for failing to open an index directory.
#[error(transparent)]
OpenDirectoryError(#[from] tantivy::directory::error::OpenDirectoryError),
/// No index path was supplied before an index was requested.
#[error("Path to index not specified")]
IndexPathNotSpecified,
}
+132
View File
@@ -0,0 +1,132 @@
use crate::error;
use crate::error::SearchError;
use crate::schema::Schema;
use std::path::PathBuf;
use tantivy::collector::TopDocs;
use tantivy::directory::{ManagedDirectory, MmapDirectory};
use tantivy::query::{BooleanQuery, Occur, QueryParser, TermQuery};
use tantivy::schema::{Field, IndexRecordOption, Value};
use tantivy::tokenizer::{NgramTokenizer, TokenizerManager};
use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy, TantivyDocument, Term, doc};
/// Builder for a `SearchIndex`.
///
/// The default value has no path set.
#[derive(Default)]
pub struct SearchIndexBuilder {
// Directory of the index; `None` until a path has been supplied.
path: Option<PathBuf>,
}
impl SearchIndexBuilder {
    /// Records the directory that holds (or will hold) the index and returns
    /// the builder for further chaining.
    pub fn with_path(self, path: impl Into<PathBuf>) -> Self {
        Self {
            path: Some(path.into()),
        }
    }

    /// Opens the index stored at the configured path, creating it when it does
    /// not exist yet, and returns a [`SearchIndex`] holding the index together
    /// with a reader and a writer.
    ///
    /// The index uses the crate schema and a tokenizer manager that contains
    /// Tantivy's default tokenizers plus an n-gram tokenizer registered under
    /// the name `ngram_32`.
    ///
    /// # Errors
    /// Returns [`SearchError::IndexPathNotSpecified`] when no path was set, or
    /// the converted Tantivy error when any step of opening the index fails.
    pub fn build(self) -> error::Result<SearchIndex> {
        // Guard clause: nothing can be opened without a path.
        let Some(index_path) = self.path else {
            return Err(SearchError::IndexPathNotSpecified);
        };

        let tokenizers = TokenizerManager::default();
        tokenizers.register("ngram_32", NgramTokenizer::new(1, 32, false)?);

        let directory = ManagedDirectory::wrap(Box::new(MmapDirectory::open(index_path)?))?;
        let index = Index::builder()
            .schema(Schema::schema().clone())
            .tokenizers(tokenizers)
            .open_or_create(directory)?;

        let reader = index
            .reader_builder()
            .reload_policy(ReloadPolicy::OnCommitWithDelay)
            .try_into()?;
        let writer = index.writer(50_000_000)?;

        Ok(SearchIndex {
            index,
            reader,
            writer,
        })
    }
}
/// A Tantivy index bundled with the reader and writer used to access it.
pub struct SearchIndex {
// The underlying Tantivy index.
index: Index,
// Reader from which searchers are obtained.
reader: IndexReader,
// Writer used to add and delete documents and to commit changes.
writer: IndexWriter,
}
impl SearchIndex {
pub fn builder() -> SearchIndexBuilder {
SearchIndexBuilder::default()
}
pub fn remove_all_annotated_iris(&mut self) {
self.writer
.delete_term(Term::from_field_u64(Schema::type_field(), 0u64));
self.writer.commit().unwrap();
}
pub fn add_annotated_iri(
&self,
iri: &str,
label: Option<&str>,
comment: Option<&str>,
) -> crate::Result<()> {
let mut document = doc!(
Schema::type_field() => 0u64,
Schema::iri_field() => iri,
);
if let Some(label) = label {
document.add_text(Schema::label_field(), label.to_lowercase());
}
if let Some(comment) = comment {
document.add_text(Schema::comment_field(), comment);
}
self.writer.add_document(document)?;
Ok(())
}
pub fn add<'a>(&self, document: TantivyDocument) -> crate::Result<()> {
self.writer.add_document(document)?;
Ok(())
}
pub fn commit(&mut self) -> crate::Result<()> {
self.writer.commit()?;
Ok(())
}
pub fn query(
&self,
type_: u64,
user_query: &str,
default_fields: Vec<Field>,
) -> error::Result<Vec<String>> {
let doc_type_term = Term::from_field_u64(Schema::type_field(), type_);
let doc_type_query = Box::new(TermQuery::new(doc_type_term, IndexRecordOption::Basic));
let parser = QueryParser::for_index(&self.index, default_fields);
let (user_query, _) = parser.parse_query_lenient(user_query);
let query = BooleanQuery::new(vec![
(Occur::Must, doc_type_query),
(Occur::Must, user_query),
]);
let searcher = self.reader.searcher();
let results = searcher.search(&query, &TopDocs::with_limit(10000).order_by_score())?;
let mut iris = vec![];
for (_score, address) in results.iter() {
let doc: TantivyDocument = searcher.doc(*address)?;
if let Some(doc_iri) = doc.get_first(Schema::iri_field()) {
let doc_iri_string = doc_iri.as_str().unwrap_or("???").to_string();
iris.push(doc_iri_string);
}
}
Ok(iris)
}
}
+9
View File
@@ -0,0 +1,9 @@
mod error;
mod index;
mod schema;
pub use tantivy::TantivyDocument as SearchDocument;
pub use error::{Result, SearchError};
pub use index::{SearchIndex, SearchIndexBuilder};
pub use schema::Schema;
+26
View File
@@ -0,0 +1,26 @@
use std::path::Path;
use tantivy::{doc, TantivyDocument};
use tracing::info;
use gl_types::CatalogEntryType;
use crate::Schema;
/// Stateless helper that turns the pages of a PDF into Tantivy documents.
pub struct PdfTextExtractor {}
impl PdfTextExtractor {
pub fn process<P: AsRef<Path>>(path: P) -> Vec<TantivyDocument> {
let document = poppler::Document::from_file(path.as_ref().to_str().unwrap(), None).unwrap();
let mut tantivy_documents = vec![];
for page in 0..document.n_pages() {
info!("Processing page {}", page);
let page = document.page(page).unwrap();
let text = page.text().unwrap();
let clean_text = text.as_str().replace("-\n", "").replace("\n", " ");
let document = doc!(
Schema::type_field() => u64::from(CatalogEntryType::AudioBook),
Schema::content_field() => clean_text.as_str(),
);
tantivy_documents.push(document);
}
tantivy_documents
}
}
+77
View File
@@ -0,0 +1,77 @@
use std::sync::OnceLock;
use tantivy::schema;
use tantivy::schema::{
Field, IndexRecordOption, Schema as TantivySchema, TextFieldIndexing, TextOptions,
};
// Process-wide schema instance, initialized on first use.
static SCHEMA: OnceLock<TantivySchema> = OnceLock::new();
/// Namespace type giving access to the crate's Tantivy schema and its fields.
pub struct Schema;
impl Schema {
    /// Returns the shared schema, building it on the first call.
    pub fn schema() -> &'static TantivySchema {
        SCHEMA.get_or_init(Self::build_schema)
    }

    /// Handle of the `type` field.
    pub fn type_field() -> Field {
        Self::named_field("type")
    }

    /// Handle of the `iri` field.
    pub fn iri_field() -> Field {
        Self::named_field("iri")
    }

    /// Handle of the `label` field.
    pub fn label_field() -> Field {
        Self::named_field("label")
    }

    /// Handle of the `comment` field.
    pub fn comment_field() -> Field {
        Self::named_field("comment")
    }

    /// The label and comment fields, in that order.
    pub fn ontology_fields() -> Vec<Field> {
        vec![Self::label_field(), Self::comment_field()]
    }

    /// The given name, surname, title, description and content fields, in that
    /// order.
    pub fn default_fields() -> Vec<Field> {
        ["given name", "surname", "title", "description", "content"]
            .into_iter()
            .map(Self::named_field)
            .collect()
    }

    /// Looks a field up by name; panics if the schema has no such field.
    fn named_field(name: &str) -> Field {
        Self::schema().get_field(name).unwrap()
    }

    /// Assembles the schema. Fields are registered in a fixed order: `type`,
    /// `iri`, the tokenized text fields, then the stored numeric fields.
    fn build_schema() -> TantivySchema {
        // Text options that index with frequencies and positions using the
        // tokenizer registered under the given name.
        let text_options = |tokenizer: &str| {
            TextOptions::default().set_indexing_options(
                TextFieldIndexing::default()
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions)
                    .set_tokenizer(tokenizer),
            )
        };

        let mut builder = TantivySchema::builder();
        builder.add_u64_field("type", schema::FAST | schema::INDEXED);
        builder.add_text_field("iri", schema::STORED | schema::STRING);

        let text_fields = [
            ("label", "ngram_32"),
            ("comment", "en_stem"),
            ("given name", "ngram_32"),
            ("surname", "ngram_32"),
            ("title", "en_stem"),
            ("description", "en_stem"),
            ("content", "en_stem"),
        ];
        for (name, tokenizer) in text_fields {
            builder.add_text_field(name, text_options(tokenizer));
        }

        for name in ["page", "book", "chapter", "verse"] {
            builder.add_u64_field(name, schema::STORED);
        }

        builder.build()
    }
}