Initial commit
This commit is contained in:
@@ -0,0 +1,95 @@
|
||||
use std::io::{BufReader, Read};
|
||||
use tantivy::{doc, TantivyDocument};
|
||||
use xml::EventReader;
|
||||
use xml::reader::XmlEvent;
|
||||
use crate::Schema;
|
||||
|
||||
/// Converts a Bible XML file into Tantivy documents, one document per verse.
///
/// The type carries no state; all functionality is exposed through associated
/// functions.
///
/// # Examples
///
/// Convert an XML file and write the result as JSON Lines. The block is marked
/// `ignore` because it needs files on disk and a surrounding function that
/// returns `std::io::Result`.
///
/// ```ignore
/// use std::fs::File;
/// use std::io::Write;
///
/// let file = File::open("./EnglishNKJBible.xml")?;
/// let documents = gl_search::BibleXmlConverter::xml_to_tantivy_documents(
///     9,
///     "https://graphofliberty.org/entity/E0",
///     file,
/// );
/// let mut output = File::create("./EnglishNKJBible.jsonl")?;
/// for document in documents {
///     let json = document.to_json(Schema::schema());
///     // `write_all` rather than `write`: `write` may stop after a partial write.
///     output.write_all(json.as_bytes())?;
///     output.write_all(b"\n")?;
/// }
/// ```
#[derive(Debug, Clone, Copy, Default)]
pub struct BibleXmlConverter {}
|
||||
impl BibleXmlConverter {
|
||||
pub fn xml_to_tantivy_documents<R: Read>(type_: u64, iri: &str, reader: R) -> Vec<TantivyDocument> {
|
||||
let file = BufReader::new(reader);
|
||||
let parser = EventReader::new(file);
|
||||
let mut documents = vec![];
|
||||
|
||||
let mut book: Option<u64> = None;
|
||||
let mut chapter: Option<u64> = None;
|
||||
let mut verse: Option<u64> = None;
|
||||
let mut content: Option<String> = None;
|
||||
for event in parser {
|
||||
match event {
|
||||
Ok(XmlEvent::StartElement { name, attributes, .. }) => {
|
||||
match name.local_name.as_str() {
|
||||
"book" => {
|
||||
for attr in attributes {
|
||||
if attr.name.local_name == "number" {
|
||||
book = attr.value.parse().ok();
|
||||
}
|
||||
}
|
||||
},
|
||||
"chapter" => {
|
||||
for attr in attributes {
|
||||
if attr.name.local_name == "number" {
|
||||
chapter = attr.value.parse().ok();
|
||||
}
|
||||
}
|
||||
},
|
||||
"verse" => {
|
||||
for attr in attributes {
|
||||
if attr.name.local_name == "number" {
|
||||
verse = attr.value.parse().ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok(XmlEvent::Characters(characters)) => {
|
||||
content = Some(characters);
|
||||
}
|
||||
Ok(XmlEvent::EndElement { name }) => {
|
||||
match name.local_name.as_str() {
|
||||
"book" => {
|
||||
book = None;
|
||||
}
|
||||
"chapter" => {
|
||||
chapter = None;
|
||||
}
|
||||
"verse" => {
|
||||
if let Some(book) = book && let Some(chapter) = chapter && let Some(verse) = verse && let Some(ref content) = content {
|
||||
let document = doc!(
|
||||
Schema::type_field() => type_,
|
||||
Schema::iri_field() => iri,
|
||||
Schema::book_field() => book,
|
||||
Schema::chapter_field() => chapter,
|
||||
Schema::verse_field() => verse,
|
||||
Schema::content_field() => content.as_str(),
|
||||
);
|
||||
documents.push(document);
|
||||
}
|
||||
verse = None;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("Error: {e}");
|
||||
break;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
documents
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user