use std::io::{BufReader, Read}; use tantivy::{doc, TantivyDocument}; use xml::EventReader; use xml::reader::XmlEvent; use crate::Schema; /// Usage: /// ``` /// let file = File::open("./EnglishNKJBible.xml")?; /// let documents = gl_search::BibleXmlConverter::xml_to_tantivy_documents(9, "https://graphofliberty.org/entity/E0", file); /// let mut output = File::create("./EnglishNKJBible.jsonl")?; /// for document in documents { /// let json = document.to_json(Schema::schema()); /// output.write(json.as_bytes())?; /// output.write("\n".as_bytes())?; /// } /// ``` pub struct BibleXmlConverter {} impl BibleXmlConverter { pub fn xml_to_tantivy_documents(type_: u64, iri: &str, reader: R) -> Vec { let file = BufReader::new(reader); let parser = EventReader::new(file); let mut documents = vec![]; let mut book: Option = None; let mut chapter: Option = None; let mut verse: Option = None; let mut content: Option = None; for event in parser { match event { Ok(XmlEvent::StartElement { name, attributes, .. }) => { match name.local_name.as_str() { "book" => { for attr in attributes { if attr.name.local_name == "number" { book = attr.value.parse().ok(); } } }, "chapter" => { for attr in attributes { if attr.name.local_name == "number" { chapter = attr.value.parse().ok(); } } }, "verse" => { for attr in attributes { if attr.name.local_name == "number" { verse = attr.value.parse().ok(); } } } _ => {} } } Ok(XmlEvent::Characters(characters)) => { content = Some(characters); } Ok(XmlEvent::EndElement { name }) => { match name.local_name.as_str() { "book" => { book = None; } "chapter" => { chapter = None; } "verse" => { if let Some(book) = book && let Some(chapter) = chapter && let Some(verse) = verse && let Some(ref content) = content { let document = doc!( Schema::type_field() => type_, Schema::iri_field() => iri, Schema::book_field() => book, Schema::chapter_field() => chapter, Schema::verse_field() => verse, Schema::content_field() => content.as_str(), ); documents.push(document); } verse = None; } _ => {} } } Err(e) => { eprintln!("Error: {e}"); break; } _ => {} } } documents } }