Initial commit
This commit is contained in:
@@ -0,0 +1,95 @@
|
||||
use std::io::{BufReader, Read};
|
||||
use tantivy::{doc, TantivyDocument};
|
||||
use xml::EventReader;
|
||||
use xml::reader::XmlEvent;
|
||||
use crate::Schema;
|
||||
|
||||
/// Usage:
|
||||
/// ```
|
||||
/// let file = File::open("./EnglishNKJBible.xml")?;
|
||||
/// let documents = gl_search::BibleXmlConverter::xml_to_tantivy_documents(9, "https://graphofliberty.org/entity/E0", file);
|
||||
/// let mut output = File::create("./EnglishNKJBible.jsonl")?;
|
||||
/// for document in documents {
|
||||
/// let json = document.to_json(Schema::schema());
|
||||
/// output.write(json.as_bytes())?;
|
||||
/// output.write("\n".as_bytes())?;
|
||||
/// }
|
||||
/// ```
|
||||
pub struct BibleXmlConverter {}
|
||||
impl BibleXmlConverter {
|
||||
pub fn xml_to_tantivy_documents<R: Read>(type_: u64, iri: &str, reader: R) -> Vec<TantivyDocument> {
|
||||
let file = BufReader::new(reader);
|
||||
let parser = EventReader::new(file);
|
||||
let mut documents = vec![];
|
||||
|
||||
let mut book: Option<u64> = None;
|
||||
let mut chapter: Option<u64> = None;
|
||||
let mut verse: Option<u64> = None;
|
||||
let mut content: Option<String> = None;
|
||||
for event in parser {
|
||||
match event {
|
||||
Ok(XmlEvent::StartElement { name, attributes, .. }) => {
|
||||
match name.local_name.as_str() {
|
||||
"book" => {
|
||||
for attr in attributes {
|
||||
if attr.name.local_name == "number" {
|
||||
book = attr.value.parse().ok();
|
||||
}
|
||||
}
|
||||
},
|
||||
"chapter" => {
|
||||
for attr in attributes {
|
||||
if attr.name.local_name == "number" {
|
||||
chapter = attr.value.parse().ok();
|
||||
}
|
||||
}
|
||||
},
|
||||
"verse" => {
|
||||
for attr in attributes {
|
||||
if attr.name.local_name == "number" {
|
||||
verse = attr.value.parse().ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok(XmlEvent::Characters(characters)) => {
|
||||
content = Some(characters);
|
||||
}
|
||||
Ok(XmlEvent::EndElement { name }) => {
|
||||
match name.local_name.as_str() {
|
||||
"book" => {
|
||||
book = None;
|
||||
}
|
||||
"chapter" => {
|
||||
chapter = None;
|
||||
}
|
||||
"verse" => {
|
||||
if let Some(book) = book && let Some(chapter) = chapter && let Some(verse) = verse && let Some(ref content) = content {
|
||||
let document = doc!(
|
||||
Schema::type_field() => type_,
|
||||
Schema::iri_field() => iri,
|
||||
Schema::book_field() => book,
|
||||
Schema::chapter_field() => chapter,
|
||||
Schema::verse_field() => verse,
|
||||
Schema::content_field() => content.as_str(),
|
||||
);
|
||||
documents.push(document);
|
||||
}
|
||||
verse = None;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("Error: {e}");
|
||||
break;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
documents
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
use thiserror::Error;
|
||||
/// Crate-wide result type whose error variant is always [`SearchError`].
pub type Result<T> = std::result::Result<T, SearchError>;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum SearchError {
|
||||
#[error(transparent)]
|
||||
TantivyError(#[from] tantivy::TantivyError),
|
||||
|
||||
#[error(transparent)]
|
||||
DocumentParsingError(#[from] tantivy::schema::DocParsingError),
|
||||
|
||||
#[error(transparent)]
|
||||
OpenDirectoryError(#[from] tantivy::directory::error::OpenDirectoryError),
|
||||
|
||||
#[error("Path to index not specified")]
|
||||
IndexPathNotSpecified,
|
||||
}
|
||||
@@ -0,0 +1,132 @@
|
||||
use crate::error;
|
||||
use crate::error::SearchError;
|
||||
use crate::schema::Schema;
|
||||
use std::path::PathBuf;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::directory::{ManagedDirectory, MmapDirectory};
|
||||
use tantivy::query::{BooleanQuery, Occur, QueryParser, TermQuery};
|
||||
use tantivy::schema::{Field, IndexRecordOption, Value};
|
||||
use tantivy::tokenizer::{NgramTokenizer, TokenizerManager};
|
||||
use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy, TantivyDocument, Term, doc};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct SearchIndexBuilder {
|
||||
path: Option<PathBuf>,
|
||||
}
|
||||
|
||||
impl SearchIndexBuilder {
|
||||
pub fn with_path(mut self, path: impl Into<PathBuf>) -> Self {
|
||||
self.path = Some(path.into());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> error::Result<SearchIndex> {
|
||||
if let Some(path) = self.path {
|
||||
let ngram_32 = NgramTokenizer::new(1, 32, false)?;
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
tokenizer_manager.register("ngram_32", ngram_32);
|
||||
|
||||
let mmap_directory = MmapDirectory::open(path)?;
|
||||
let managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory))?;
|
||||
let index = Index::builder()
|
||||
.schema(Schema::schema().clone())
|
||||
.tokenizers(tokenizer_manager)
|
||||
.open_or_create(managed_directory)?;
|
||||
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommitWithDelay)
|
||||
.try_into()?;
|
||||
|
||||
let writer = index.writer(50_000_000)?;
|
||||
|
||||
Ok(SearchIndex {
|
||||
index,
|
||||
reader,
|
||||
writer,
|
||||
})
|
||||
} else {
|
||||
Err(SearchError::IndexPathNotSpecified)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SearchIndex {
|
||||
index: Index,
|
||||
reader: IndexReader,
|
||||
writer: IndexWriter,
|
||||
}
|
||||
|
||||
impl SearchIndex {
|
||||
pub fn builder() -> SearchIndexBuilder {
|
||||
SearchIndexBuilder::default()
|
||||
}
|
||||
|
||||
pub fn remove_all_annotated_iris(&mut self) {
|
||||
self.writer
|
||||
.delete_term(Term::from_field_u64(Schema::type_field(), 0u64));
|
||||
self.writer.commit().unwrap();
|
||||
}
|
||||
|
||||
pub fn add_annotated_iri(
|
||||
&self,
|
||||
iri: &str,
|
||||
label: Option<&str>,
|
||||
comment: Option<&str>,
|
||||
) -> crate::Result<()> {
|
||||
let mut document = doc!(
|
||||
Schema::type_field() => 0u64,
|
||||
Schema::iri_field() => iri,
|
||||
);
|
||||
|
||||
if let Some(label) = label {
|
||||
document.add_text(Schema::label_field(), label.to_lowercase());
|
||||
}
|
||||
|
||||
if let Some(comment) = comment {
|
||||
document.add_text(Schema::comment_field(), comment);
|
||||
}
|
||||
|
||||
self.writer.add_document(document)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn add<'a>(&self, document: TantivyDocument) -> crate::Result<()> {
|
||||
self.writer.add_document(document)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn commit(&mut self) -> crate::Result<()> {
|
||||
self.writer.commit()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn query(
|
||||
&self,
|
||||
type_: u64,
|
||||
user_query: &str,
|
||||
default_fields: Vec<Field>,
|
||||
) -> error::Result<Vec<String>> {
|
||||
let doc_type_term = Term::from_field_u64(Schema::type_field(), type_);
|
||||
let doc_type_query = Box::new(TermQuery::new(doc_type_term, IndexRecordOption::Basic));
|
||||
|
||||
let parser = QueryParser::for_index(&self.index, default_fields);
|
||||
let (user_query, _) = parser.parse_query_lenient(user_query);
|
||||
let query = BooleanQuery::new(vec![
|
||||
(Occur::Must, doc_type_query),
|
||||
(Occur::Must, user_query),
|
||||
]);
|
||||
let searcher = self.reader.searcher();
|
||||
let results = searcher.search(&query, &TopDocs::with_limit(10000).order_by_score())?;
|
||||
let mut iris = vec![];
|
||||
for (_score, address) in results.iter() {
|
||||
let doc: TantivyDocument = searcher.doc(*address)?;
|
||||
if let Some(doc_iri) = doc.get_first(Schema::iri_field()) {
|
||||
let doc_iri_string = doc_iri.as_str().unwrap_or("???").to_string();
|
||||
iris.push(doc_iri_string);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(iris)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
mod error;
|
||||
mod index;
|
||||
mod schema;
|
||||
|
||||
pub use tantivy::TantivyDocument as SearchDocument;
|
||||
|
||||
pub use error::{Result, SearchError};
|
||||
pub use index::{SearchIndex, SearchIndexBuilder};
|
||||
pub use schema::Schema;
|
||||
@@ -0,0 +1,26 @@
|
||||
use std::path::Path;
|
||||
use tantivy::{doc, TantivyDocument};
|
||||
use tracing::info;
|
||||
use gl_types::CatalogEntryType;
|
||||
use crate::Schema;
|
||||
|
||||
pub struct PdfTextExtractor {}
|
||||
|
||||
impl PdfTextExtractor {
|
||||
pub fn process<P: AsRef<Path>>(path: P) -> Vec<TantivyDocument> {
|
||||
let document = poppler::Document::from_file(path.as_ref().to_str().unwrap(), None).unwrap();
|
||||
let mut tantivy_documents = vec![];
|
||||
for page in 0..document.n_pages() {
|
||||
info!("Processing page {}", page);
|
||||
let page = document.page(page).unwrap();
|
||||
let text = page.text().unwrap();
|
||||
let clean_text = text.as_str().replace("-\n", "").replace("\n", " ");
|
||||
let document = doc!(
|
||||
Schema::type_field() => u64::from(CatalogEntryType::AudioBook),
|
||||
Schema::content_field() => clean_text.as_str(),
|
||||
);
|
||||
tantivy_documents.push(document);
|
||||
}
|
||||
tantivy_documents
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,77 @@
|
||||
use std::sync::OnceLock;
|
||||
use tantivy::schema;
|
||||
use tantivy::schema::{
|
||||
Field, IndexRecordOption, Schema as TantivySchema, TextFieldIndexing, TextOptions,
|
||||
};
|
||||
|
||||
static SCHEMA: OnceLock<TantivySchema> = OnceLock::new();
|
||||
|
||||
pub struct Schema;
|
||||
|
||||
impl Schema {
|
||||
pub fn schema() -> &'static TantivySchema {
|
||||
SCHEMA.get_or_init(|| {
|
||||
let ngram_32 = TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
|
||||
.set_tokenizer("ngram_32"),
|
||||
);
|
||||
|
||||
let en_stem = TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
|
||||
.set_tokenizer("en_stem"),
|
||||
);
|
||||
|
||||
let mut schema_builder = TantivySchema::builder();
|
||||
schema_builder.add_u64_field("type", schema::FAST | schema::INDEXED);
|
||||
schema_builder.add_text_field("iri", schema::STORED | schema::STRING);
|
||||
schema_builder.add_text_field("label", ngram_32.clone());
|
||||
schema_builder.add_text_field("comment", en_stem.clone());
|
||||
|
||||
schema_builder.add_text_field("given name", ngram_32.clone());
|
||||
schema_builder.add_text_field("surname", ngram_32);
|
||||
|
||||
schema_builder.add_text_field("title", en_stem.clone());
|
||||
schema_builder.add_text_field("description", en_stem.clone());
|
||||
schema_builder.add_text_field("content", en_stem);
|
||||
|
||||
schema_builder.add_u64_field("page", schema::STORED);
|
||||
schema_builder.add_u64_field("book", schema::STORED);
|
||||
schema_builder.add_u64_field("chapter", schema::STORED);
|
||||
schema_builder.add_u64_field("verse", schema::STORED);
|
||||
|
||||
schema_builder.build()
|
||||
})
|
||||
}
|
||||
|
||||
pub fn type_field() -> Field {
|
||||
Self::schema().get_field("type").unwrap()
|
||||
}
|
||||
|
||||
pub fn iri_field() -> Field {
|
||||
Self::schema().get_field("iri").unwrap()
|
||||
}
|
||||
|
||||
pub fn label_field() -> Field {
|
||||
Self::schema().get_field("label").unwrap()
|
||||
}
|
||||
|
||||
pub fn comment_field() -> Field {
|
||||
Self::schema().get_field("comment").unwrap()
|
||||
}
|
||||
|
||||
pub fn ontology_fields() -> Vec<Field> {
|
||||
vec![Self::label_field(), Self::comment_field()]
|
||||
}
|
||||
|
||||
pub fn default_fields() -> Vec<Field> {
|
||||
vec![
|
||||
Self::schema().get_field("given name").unwrap(),
|
||||
Self::schema().get_field("surname").unwrap(),
|
||||
Self::schema().get_field("title").unwrap(),
|
||||
Self::schema().get_field("description").unwrap(),
|
||||
Self::schema().get_field("content").unwrap(),
|
||||
]
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user