Initial commit

2026-06-08 19:33:49 -04:00
commit 6906fb973a
24 changed files with 9382 additions and 0 deletions
@@ -0,0 +1,95 @@
+use std::io::{BufReader, Read};
+use tantivy::{doc, TantivyDocument};
+use xml::EventReader;
+use xml::reader::XmlEvent;
+use crate::Schema;
+
+/// Converts a Bible XML file into Tantivy documents, one document per verse.
+///
+/// The type carries no state; the conversion is exposed through the associated
+/// function `xml_to_tantivy_documents`.
+///
+/// # Examples
+///
+/// ```ignore
+/// use std::fs::File;
+/// use std::io::Write;
+///
+/// let file = File::open("./EnglishNKJBible.xml")?;
+/// let documents = gl_search::BibleXmlConverter::xml_to_tantivy_documents(
+///     9,
+///     "https://graphofliberty.org/entity/E0",
+///     file,
+/// );
+/// let mut output = File::create("./EnglishNKJBible.jsonl")?;
+/// for document in documents {
+///     let json = document.to_json(Schema::schema());
+///     // `write_all` writes the whole buffer; plain `write` may stop early.
+///     output.write_all(json.as_bytes())?;
+///     output.write_all(b"\n")?;
+/// }
+/// ```
+pub struct BibleXmlConverter {}
+impl BibleXmlConverter {
+    /// Parses Bible XML from `reader` and builds one Tantivy document per verse.
+    ///
+    /// Elements are matched by local name: `book`, `chapter` and `verse`, each
+    /// carrying its ordinal in a `number` attribute. A document is emitted when
+    /// a `verse` element closes, provided the enclosing book and chapter numbers
+    /// and the verse number all parsed as `u64` and the verse contains text.
+    ///
+    /// Text is accumulated per verse: every character chunk inside the verse is
+    /// appended (so inline child elements do not truncate it), and the buffer is
+    /// reset for each verse so that a verse without text never inherits the text
+    /// of an earlier one.
+    ///
+    /// * `type_` - value stored in the `type` field of every document.
+    /// * `iri` - value stored in the `iri` field of every document.
+    /// * `reader` - source of the XML; it is wrapped in a `BufReader`.
+    ///
+    /// On the first XML parse error the error is printed to stderr and the
+    /// documents collected so far are returned.
+    pub fn xml_to_tantivy_documents<R: Read>(
+        type_: u64,
+        iri: &str,
+        reader: R,
+    ) -> Vec<TantivyDocument> {
+        let parser = EventReader::new(BufReader::new(reader));
+        let mut documents = vec![];
+
+        let mut book: Option<u64> = None;
+        let mut chapter: Option<u64> = None;
+        let mut verse: Option<u64> = None;
+        // Text of the verse currently being read; cleared at every verse boundary.
+        let mut content = String::new();
+        for event in parser {
+            match event {
+                Ok(XmlEvent::StartElement { name, attributes, .. }) => {
+                    match name.local_name.as_str() {
+                        "book" => book = Self::number_attribute(&attributes),
+                        "chapter" => chapter = Self::number_attribute(&attributes),
+                        "verse" => {
+                            verse = Self::number_attribute(&attributes);
+                            content.clear();
+                        }
+                        _ => {}
+                    }
+                }
+                Ok(XmlEvent::Characters(characters)) => {
+                    // Only text inside a numbered verse belongs to a document.
+                    if verse.is_some() {
+                        content.push_str(&characters);
+                    }
+                }
+                Ok(XmlEvent::EndElement { name }) => {
+                    match name.local_name.as_str() {
+                        "book" => book = None,
+                        "chapter" => chapter = None,
+                        "verse" => {
+                            match (book, chapter, verse) {
+                                (Some(book), Some(chapter), Some(verse)) if !content.is_empty() => {
+                                    documents.push(doc!(
+                                        Schema::type_field() => type_,
+                                        Schema::iri_field() => iri,
+                                        Schema::book_field() => book,
+                                        Schema::chapter_field() => chapter,
+                                        Schema::verse_field() => verse,
+                                        Schema::content_field() => content.as_str(),
+                                    ));
+                                }
+                                _ => {}
+                            }
+                            verse = None;
+                            content.clear();
+                        }
+                        _ => {}
+                    }
+                }
+                Err(e) => {
+                    eprintln!("Error: {e}");
+                    break;
+                }
+                _ => {}
+            }
+        }
+
+        documents
+    }
+
+    /// Returns the value of the `number` attribute parsed as `u64`, or `None`
+    /// when the attribute is absent or not a valid number.
+    fn number_attribute(attributes: &[xml::attribute::OwnedAttribute]) -> Option<u64> {
+        attributes
+            .iter()
+            .rev()
+            .find(|attribute| attribute.name.local_name == "number")
+            .and_then(|attribute| attribute.value.parse().ok())
+    }
+}
@@ -0,0 +1,17 @@
+use thiserror::Error;
+/// Result alias used throughout the crate; the error type is always [`SearchError`].
+pub type Result<R> = std::result::Result<R, SearchError>;
+
+/// Errors surfaced by the search crate.
+///
+/// The `transparent` variants forward `Display` and `source` to the wrapped
+/// error, and their `#[from]` attribute lets `?` convert the wrapped error
+/// into a `SearchError`.
+#[derive(Debug, Error)]
+pub enum SearchError {
+    /// Wraps a `tantivy::TantivyError`.
+    #[error(transparent)]
+    TantivyError(#[from] tantivy::TantivyError),
+
+    /// Wraps a `tantivy::schema::DocParsingError`.
+    #[error(transparent)]
+    DocumentParsingError(#[from] tantivy::schema::DocParsingError),
+
+    /// Wraps a `tantivy::directory::error::OpenDirectoryError`.
+    #[error(transparent)]
+    OpenDirectoryError(#[from] tantivy::directory::error::OpenDirectoryError),
+
+    /// No index path was supplied where one is required.
+    #[error("Path to index not specified")]
+    IndexPathNotSpecified,
+}
@@ -0,0 +1,132 @@
+use crate::error;
+use crate::error::SearchError;
+use crate::schema::Schema;
+use std::path::PathBuf;
+use tantivy::collector::TopDocs;
+use tantivy::directory::{ManagedDirectory, MmapDirectory};
+use tantivy::query::{BooleanQuery, Occur, QueryParser, TermQuery};
+use tantivy::schema::{Field, IndexRecordOption, Value};
+use tantivy::tokenizer::{NgramTokenizer, TokenizerManager};
+use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy, TantivyDocument, Term, doc};
+
+/// Builder for [`SearchIndex`]; an index path must be supplied before `build`.
+#[derive(Default)]
+pub struct SearchIndexBuilder {
+    /// Directory of the on-disk index; `None` until `with_path` is called.
+    path: Option<PathBuf>,
+}
+
+impl SearchIndexBuilder {
+    /// Sets the directory in which the index is stored.
+    pub fn with_path(self, path: impl Into<PathBuf>) -> Self {
+        Self {
+            path: Some(path.into()),
+        }
+    }
+
+    /// Opens the index at the configured path, creating it if necessary, and
+    /// returns a [`SearchIndex`] holding the index, a reader and a writer.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`SearchError::IndexPathNotSpecified`] when no path was set, and
+    /// propagates any error from opening the directory or the index.
+    pub fn build(self) -> error::Result<SearchIndex> {
+        let Some(index_path) = self.path else {
+            return Err(SearchError::IndexPathNotSpecified);
+        };
+
+        // The schema refers to the "ngram_32" tokenizer by name, so it has to be
+        // registered before the index is opened.
+        let ngram_tokenizer = NgramTokenizer::new(1, 32, false)?;
+        let tokenizers = TokenizerManager::default();
+        tokenizers.register("ngram_32", ngram_tokenizer);
+
+        let directory = ManagedDirectory::wrap(Box::new(MmapDirectory::open(index_path)?))?;
+        let index = Index::builder()
+            .schema(Schema::schema().clone())
+            .tokenizers(tokenizers)
+            .open_or_create(directory)?;
+
+        let reader: IndexReader = index
+            .reader_builder()
+            .reload_policy(ReloadPolicy::OnCommitWithDelay)
+            .try_into()?;
+
+        // 50_000_000 is the memory budget, in bytes, handed to the index writer.
+        let writer = index.writer(50_000_000)?;
+
+        Ok(SearchIndex {
+            index,
+            reader,
+            writer,
+        })
+    }
+}
+
+/// A Tantivy index bundled with the reader and writer used to query and update it.
+///
+/// Instances are created through `SearchIndex::builder()`.
+pub struct SearchIndex {
+    /// The underlying Tantivy index; used to build query parsers.
+    index: Index,
+    /// Reader from which searchers are obtained.
+    reader: IndexReader,
+    /// Writer through which documents are added, deleted and committed.
+    writer: IndexWriter,
+}
+
+impl SearchIndex {
+    /// Value of the `type` field that marks a document as an annotated IRI.
+    const ANNOTATED_IRI_TYPE: u64 = 0;
+
+    /// Returns a builder with no path configured.
+    pub fn builder() -> SearchIndexBuilder {
+        SearchIndexBuilder::default()
+    }
+
+    /// Deletes every annotated-IRI document (type `0`) and commits the deletion.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the commit fails.
+    pub fn remove_all_annotated_iris(&mut self) {
+        self.writer.delete_term(Term::from_field_u64(
+            Schema::type_field(),
+            Self::ANNOTATED_IRI_TYPE,
+        ));
+        self.writer.commit().unwrap();
+    }
+
+    /// Queues an annotated-IRI document (type `0`) for indexing.
+    ///
+    /// The label, when present, is stored lower-cased; the comment is stored as
+    /// given. The document becomes searchable only after [`SearchIndex::commit`].
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error reported by the index writer.
+    pub fn add_annotated_iri(
+        &self,
+        iri: &str,
+        label: Option<&str>,
+        comment: Option<&str>,
+    ) -> error::Result<()> {
+        let mut document = doc!(
+            Schema::type_field() => Self::ANNOTATED_IRI_TYPE,
+            Schema::iri_field() => iri,
+        );
+
+        if let Some(label) = label {
+            document.add_text(Schema::label_field(), label.to_lowercase());
+        }
+
+        if let Some(comment) = comment {
+            document.add_text(Schema::comment_field(), comment);
+        }
+
+        self.writer.add_document(document)?;
+        Ok(())
+    }
+
+    /// Queues an already-built document for indexing.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error reported by the index writer.
+    pub fn add(&self, document: TantivyDocument) -> error::Result<()> {
+        self.writer.add_document(document)?;
+        Ok(())
+    }
+
+    /// Commits all queued additions and deletions.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error reported by the index writer.
+    pub fn commit(&mut self) -> error::Result<()> {
+        self.writer.commit()?;
+        Ok(())
+    }
+
+    /// Searches documents of type `type_` and returns the IRIs of the matches,
+    /// ordered by score.
+    ///
+    /// `user_query` is parsed leniently against `default_fields`; parse problems
+    /// are discarded rather than reported. At most 10000 results are returned.
+    /// Matching documents without an `iri` value are skipped, and an `iri` value
+    /// that is not a string is reported as `"???"`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error from running the search or loading a document.
+    pub fn query(
+        &self,
+        type_: u64,
+        user_query: &str,
+        default_fields: Vec<Field>,
+    ) -> error::Result<Vec<String>> {
+        let doc_type_term = Term::from_field_u64(Schema::type_field(), type_);
+        let doc_type_query = Box::new(TermQuery::new(doc_type_term, IndexRecordOption::Basic));
+
+        let parser = QueryParser::for_index(&self.index, default_fields);
+        let (user_query, _) = parser.parse_query_lenient(user_query);
+        // Both clauses are mandatory: the document must have the requested type
+        // and must match the user's query.
+        let query = BooleanQuery::new(vec![
+            (Occur::Must, doc_type_query),
+            (Occur::Must, user_query),
+        ]);
+        let searcher = self.reader.searcher();
+        let results = searcher.search(&query, &TopDocs::with_limit(10000).order_by_score())?;
+        let mut iris = vec![];
+        for (_score, address) in results.iter() {
+            let doc: TantivyDocument = searcher.doc(*address)?;
+            if let Some(doc_iri) = doc.get_first(Schema::iri_field()) {
+                let doc_iri_string = doc_iri.as_str().unwrap_or("???").to_string();
+                iris.push(doc_iri_string);
+            }
+        }
+
+        Ok(iris)
+    }
+}
@@ -0,0 +1,9 @@
+mod error;
+mod index;
+mod schema;
+
+pub use tantivy::TantivyDocument as SearchDocument;
+
+pub use error::{Result, SearchError};
+pub use index::{SearchIndex, SearchIndexBuilder};
+pub use schema::Schema;
@@ -0,0 +1,26 @@
+use std::path::Path;
+use tantivy::{doc, TantivyDocument};
+use tracing::info;
+use gl_types::CatalogEntryType;
+use crate::Schema;
+
+/// Turns the text of a PDF into Tantivy documents, one per page.
+pub struct PdfTextExtractor {}
+
+impl PdfTextExtractor {
+    /// Opens the PDF at `path` and returns one document per page, holding the
+    /// page text with hyphenated line breaks (`"-\n"`) removed and the remaining
+    /// newlines replaced by spaces.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `path` is not valid UTF-8, if the document cannot be opened, or
+    /// if a page or its text cannot be obtained.
+    // NOTE(review): every document is tagged `CatalogEntryType::AudioBook`, which
+    // looks surprising for PDF text - confirm this is the intended entry type.
+    // NOTE(review): `from_file` is given the path string unchanged; verify
+    // whether the poppler binding in use expects a `file://` URI instead.
+    pub fn process<P: AsRef<Path>>(path: P) -> Vec<TantivyDocument> {
+        let pdf_path = path.as_ref().to_str().unwrap();
+        let pdf = poppler::Document::from_file(pdf_path, None).unwrap();
+        (0..pdf.n_pages())
+            .map(|page_index| {
+                info!("Processing page {}", page_index);
+                let page = pdf.page(page_index).unwrap();
+                let raw_text = page.text().unwrap();
+                // Re-join words split across lines before flattening the page
+                // into a single line.
+                let clean_text = raw_text.as_str().replace("-\n", "").replace("\n", " ");
+                doc!(
+                    Schema::type_field() => u64::from(CatalogEntryType::AudioBook),
+                    Schema::content_field() => clean_text.as_str(),
+                )
+            })
+            .collect()
+    }
+}
@@ -0,0 +1,77 @@
+use std::sync::OnceLock;
+use tantivy::schema;
+use tantivy::schema::{
+    Field, IndexRecordOption, Schema as TantivySchema, TextFieldIndexing, TextOptions,
+};
+
+/// Process-wide schema instance, built on the first call to `Schema::schema`.
+static SCHEMA: OnceLock<TantivySchema> = OnceLock::new();
+
+/// Namespace for the search index schema and accessors for its fields.
+pub struct Schema;
+
+impl Schema {
+    /// Returns the shared Tantivy schema, building it on first use.
+    ///
+    /// Text fields use one of two tokenizers, referenced by name: `ngram_32`
+    /// (label and person names) and `en_stem` (comment, title, description and
+    /// content). The numeric `page`, `book`, `chapter` and `verse` fields are
+    /// stored only.
+    pub fn schema() -> &'static TantivySchema {
+        SCHEMA.get_or_init(|| {
+            let ngram_32 = TextOptions::default().set_indexing_options(
+                TextFieldIndexing::default()
+                    .set_index_option(IndexRecordOption::WithFreqsAndPositions)
+                    .set_tokenizer("ngram_32"),
+            );
+
+            let en_stem = TextOptions::default().set_indexing_options(
+                TextFieldIndexing::default()
+                    .set_index_option(IndexRecordOption::WithFreqsAndPositions)
+                    .set_tokenizer("en_stem"),
+            );
+
+            // Fields are registered in a fixed order; do not reorder them.
+            let mut schema_builder = TantivySchema::builder();
+            schema_builder.add_u64_field("type", schema::FAST | schema::INDEXED);
+            schema_builder.add_text_field("iri", schema::STORED | schema::STRING);
+            schema_builder.add_text_field("label", ngram_32.clone());
+            schema_builder.add_text_field("comment", en_stem.clone());
+
+            schema_builder.add_text_field("given name", ngram_32.clone());
+            schema_builder.add_text_field("surname", ngram_32);
+
+            schema_builder.add_text_field("title", en_stem.clone());
+            schema_builder.add_text_field("description", en_stem.clone());
+            schema_builder.add_text_field("content", en_stem);
+
+            schema_builder.add_u64_field("page", schema::STORED);
+            schema_builder.add_u64_field("book", schema::STORED);
+            schema_builder.add_u64_field("chapter", schema::STORED);
+            schema_builder.add_u64_field("verse", schema::STORED);
+
+            schema_builder.build()
+        })
+    }
+
+    /// Looks up a field by name.
+    ///
+    /// Panics if `name` is not part of the schema; every caller in this impl
+    /// passes a name registered in `schema()`.
+    fn field(name: &str) -> Field {
+        Self::schema()
+            .get_field(name)
+            .expect("field is registered in Schema::schema()")
+    }
+
+    /// The `type` field: numeric document type.
+    pub fn type_field() -> Field {
+        Self::field("type")
+    }
+
+    /// The `iri` field: stored identifier of the document.
+    pub fn iri_field() -> Field {
+        Self::field("iri")
+    }
+
+    /// The `label` field.
+    pub fn label_field() -> Field {
+        Self::field("label")
+    }
+
+    /// The `comment` field.
+    pub fn comment_field() -> Field {
+        Self::field("comment")
+    }
+
+    /// The `content` field.
+    pub fn content_field() -> Field {
+        Self::field("content")
+    }
+
+    /// The `page` field.
+    pub fn page_field() -> Field {
+        Self::field("page")
+    }
+
+    /// The `book` field.
+    pub fn book_field() -> Field {
+        Self::field("book")
+    }
+
+    /// The `chapter` field.
+    pub fn chapter_field() -> Field {
+        Self::field("chapter")
+    }
+
+    /// The `verse` field.
+    pub fn verse_field() -> Field {
+        Self::field("verse")
+    }
+
+    /// Fields searched for ontology queries: label and comment.
+    pub fn ontology_fields() -> Vec<Field> {
+        vec![Self::label_field(), Self::comment_field()]
+    }
+
+    /// Fields searched by default: given name, surname, title, description and
+    /// content, in that order.
+    pub fn default_fields() -> Vec<Field> {
+        ["given name", "surname", "title", "description", "content"]
+            .into_iter()
+            .map(Self::field)
+            .collect()
+    }
+}