SHA256
1
0
Files
tools/search/src/bible.rs
T
2026-06-08 19:33:49 -04:00

95 lines
3.7 KiB
Rust

use std::io::{BufReader, Read};
use tantivy::{doc, TantivyDocument};
use xml::EventReader;
use xml::reader::XmlEvent;
use crate::Schema;
/// Converter from a Bible XML document to Tantivy documents.
///
/// The type carries no state; in this file it only serves as the home of the
/// associated function `xml_to_tantivy_documents`.
///
/// # Examples
///
/// The snippet is marked `ignore` because it omits the required imports and the
/// enclosing function returning a `Result` that the `?` operator needs.
///
/// ```ignore
/// let file = File::open("./EnglishNKJBible.xml")?;
/// let documents = gl_search::BibleXmlConverter::xml_to_tantivy_documents(9, "https://graphofliberty.org/entity/E0", file);
/// let mut output = File::create("./EnglishNKJBible.jsonl")?;
/// for document in documents {
///     let json = document.to_json(Schema::schema());
///     // `write_all` is used because `write` may write only part of the buffer.
///     output.write_all(json.as_bytes())?;
///     output.write_all("\n".as_bytes())?;
/// }
/// ```
pub struct BibleXmlConverter {}
impl BibleXmlConverter {
pub fn xml_to_tantivy_documents<R: Read>(type_: u64, iri: &str, reader: R) -> Vec<TantivyDocument> {
let file = BufReader::new(reader);
let parser = EventReader::new(file);
let mut documents = vec![];
let mut book: Option<u64> = None;
let mut chapter: Option<u64> = None;
let mut verse: Option<u64> = None;
let mut content: Option<String> = None;
for event in parser {
match event {
Ok(XmlEvent::StartElement { name, attributes, .. }) => {
match name.local_name.as_str() {
"book" => {
for attr in attributes {
if attr.name.local_name == "number" {
book = attr.value.parse().ok();
}
}
},
"chapter" => {
for attr in attributes {
if attr.name.local_name == "number" {
chapter = attr.value.parse().ok();
}
}
},
"verse" => {
for attr in attributes {
if attr.name.local_name == "number" {
verse = attr.value.parse().ok();
}
}
}
_ => {}
}
}
Ok(XmlEvent::Characters(characters)) => {
content = Some(characters);
}
Ok(XmlEvent::EndElement { name }) => {
match name.local_name.as_str() {
"book" => {
book = None;
}
"chapter" => {
chapter = None;
}
"verse" => {
if let Some(book) = book && let Some(chapter) = chapter && let Some(verse) = verse && let Some(ref content) = content {
let document = doc!(
Schema::type_field() => type_,
Schema::iri_field() => iri,
Schema::book_field() => book,
Schema::chapter_field() => chapter,
Schema::verse_field() => verse,
Schema::content_field() => content.as_str(),
);
documents.push(document);
}
verse = None;
}
_ => {}
}
}
Err(e) => {
eprintln!("Error: {e}");
break;
}
_ => {}
}
}
documents
}
}