95 lines
3.7 KiB
Rust
95 lines
3.7 KiB
Rust
|
|
use std::io::{BufReader, Read};
|
||
|
|
use tantivy::{doc, TantivyDocument};
|
||
|
|
use xml::EventReader;
|
||
|
|
use xml::reader::XmlEvent;
|
||
|
|
use crate::Schema;
|
||
|
|
|
||
|
|
/// Converts Bible XML into Tantivy documents.
///
/// The type carries no state; it only namespaces the conversion function
/// `xml_to_tantivy_documents`.
///
/// # Examples
///
/// ```ignore
/// use std::fs::File;
/// use std::io::Write;
///
/// let file = File::open("./EnglishNKJBible.xml")?;
/// let documents = gl_search::BibleXmlConverter::xml_to_tantivy_documents(
///     9,
///     "https://graphofliberty.org/entity/E0",
///     file,
/// );
/// let mut output = File::create("./EnglishNKJBible.jsonl")?;
/// for document in documents {
///     let json = document.to_json(Schema::schema());
///     // `write_all` keeps writing until the whole buffer is out; plain `write`
///     // may stop after a partial write.
///     output.write_all(json.as_bytes())?;
///     output.write_all(b"\n")?;
/// }
/// ```
// NOTE(review): the example is marked `ignore` because it uses `?` outside a function
// returning `Result` and reads a file from disk; the import paths for `Schema` and for
// `to_json` also need confirming before it can be switched to `no_run`.
pub struct BibleXmlConverter {}
|
||
|
|
impl BibleXmlConverter {
|
||
|
|
pub fn xml_to_tantivy_documents<R: Read>(type_: u64, iri: &str, reader: R) -> Vec<TantivyDocument> {
|
||
|
|
let file = BufReader::new(reader);
|
||
|
|
let parser = EventReader::new(file);
|
||
|
|
let mut documents = vec![];
|
||
|
|
|
||
|
|
let mut book: Option<u64> = None;
|
||
|
|
let mut chapter: Option<u64> = None;
|
||
|
|
let mut verse: Option<u64> = None;
|
||
|
|
let mut content: Option<String> = None;
|
||
|
|
for event in parser {
|
||
|
|
match event {
|
||
|
|
Ok(XmlEvent::StartElement { name, attributes, .. }) => {
|
||
|
|
match name.local_name.as_str() {
|
||
|
|
"book" => {
|
||
|
|
for attr in attributes {
|
||
|
|
if attr.name.local_name == "number" {
|
||
|
|
book = attr.value.parse().ok();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"chapter" => {
|
||
|
|
for attr in attributes {
|
||
|
|
if attr.name.local_name == "number" {
|
||
|
|
chapter = attr.value.parse().ok();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"verse" => {
|
||
|
|
for attr in attributes {
|
||
|
|
if attr.name.local_name == "number" {
|
||
|
|
verse = attr.value.parse().ok();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
_ => {}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
Ok(XmlEvent::Characters(characters)) => {
|
||
|
|
content = Some(characters);
|
||
|
|
}
|
||
|
|
Ok(XmlEvent::EndElement { name }) => {
|
||
|
|
match name.local_name.as_str() {
|
||
|
|
"book" => {
|
||
|
|
book = None;
|
||
|
|
}
|
||
|
|
"chapter" => {
|
||
|
|
chapter = None;
|
||
|
|
}
|
||
|
|
"verse" => {
|
||
|
|
if let Some(book) = book && let Some(chapter) = chapter && let Some(verse) = verse && let Some(ref content) = content {
|
||
|
|
let document = doc!(
|
||
|
|
Schema::type_field() => type_,
|
||
|
|
Schema::iri_field() => iri,
|
||
|
|
Schema::book_field() => book,
|
||
|
|
Schema::chapter_field() => chapter,
|
||
|
|
Schema::verse_field() => verse,
|
||
|
|
Schema::content_field() => content.as_str(),
|
||
|
|
);
|
||
|
|
documents.push(document);
|
||
|
|
}
|
||
|
|
verse = None;
|
||
|
|
}
|
||
|
|
_ => {}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
Err(e) => {
|
||
|
|
eprintln!("Error: {e}");
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
_ => {}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
documents
|
||
|
|
}
|
||
|
|
}
|