diff --git a/scraper/src/nz/co/dewar/biblescraper/DictionaryBuilder.java b/scraper/src/nz/co/dewar/biblescraper/DictionaryBuilder.java
new file mode 100644
index 0000000..ac54140
--- /dev/null
+++ b/scraper/src/nz/co/dewar/biblescraper/DictionaryBuilder.java
@@ -0,0 +1,52 @@
+package nz.co.dewar.biblescraper;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+public class DictionaryBuilder {
+ static String fileName = "NIV.xml"; // Input file that main() parses with Jsoup as UTF-8
+ static String outputFileName = "verses.txt"; // Output file that main() writes the cleaned verses to, one per line
+ public static void main(String[] args) throws IOException{
+ Document doc = Jsoup.parse(new File(fileName), "UTF-8");
+ Elements verses = doc.select("verse");
+ // Remove footnotes and cross-references
+ verses.select("note").remove();
+ PrintStream output = new PrintStream(new File(outputFileName), "UTF-8");
+ for(Element verse : verses){
+ // Get the raw text of the verse
+ String verseText = verse.text();
+ // Turn dashes into spaces (the code below will remove them and inappropriately conjoin words)
+ verseText = verseText.replaceAll("—", " ").trim();
+ // Clean it up (remove all punctuation, excess spacing, etc)
+ verseText = verseText.replaceAll("[^a-zA-Z0-9 ]", "").trim();
+ while(verseText.contains(" ")){
+ verseText = verseText.replaceAll(" ", " ");
+ }
+ // Make it be upper case for case-insensitive matching
+ verseText = verseText.toUpperCase();
+ // Write it out
+ output.println(verseText);
+ }
+ output.close();
+ }
diff --git a/scraper/src/nz/co/dewar/biblescraper/Main.java b/scraper/src/nz/co/dewar/biblescraper/Main.java
new file mode 100644
index 0000000..5117406
--- /dev/null
+++ b/scraper/src/nz/co/dewar/biblescraper/Main.java
@@ -0,0 +1,236 @@
+package nz.co.dewar.biblescraper;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.Scanner;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.parser.Tag;
+import org.jsoup.select.Elements;
+public class Main {
+ static final int NUM_BOOKS_OT = 39; // Book count main() passes to getTestament for the Old Testament
+ static final int NUM_BOOKS_NT = 27; // Book count main() passes to getTestament for the New Testament
+ //static final String source = "D:\\George\\Development\\BibleGateway2OSIS\\{BOOK}-{CHAPTER}-{VERSION}.html";
+ static final String source = "http://mobile.biblegateway.com/passage/?search={BOOK}%20{CHAPTER}&version={VERSION}"; // URL template with {BOOK}, {CHAPTER} and {VERSION} placeholders; the code that fills them in is not visible here
+ static String version = "NKJV"; // Version code; main() puts it in the header text and uses it as the output file name (version + ".xml")
+ public static void main(String[] args) throws Exception{
+ String header = "\n"
+ + "\n"
+ + "\n\n"
+ + "\n"
+ + "\t\n"
+ + "\t\t" + version + "\n"
+ + "\t\n"
+ + "\n\n";
+ String footer = "\n\n\n";
+ // Get all the books of the old and new testament
+ Scanner bookScanner = new Scanner(new File("books.dat"));
+ Element oldTestament = getTestament(bookScanner, NUM_BOOKS_OT);
+ Element newTestament = getTestament(bookScanner, NUM_BOOKS_NT);
+ bookScanner.close();
+ // Output passage to String
+ String passageXml = (oldTestament.outerHtml() + newTestament.outerHtml())
+ .replaceAll("osisid", "osisID")
+ .replaceAll("osisref", "osisRef")
+ .replaceAll("divinename", "divineName");
+ // Remove whitespace from before a tag to fix mystery blank cross-reference in AndBible
+ passageXml = passageXml.replaceAll("[ \t\r\n]*", "");
+ // Print output to file
+ PrintStream output = new PrintStream(new File(version + ".xml"), "UTF-8");
+ output.print(header + passageXml + footer);
+ output.close();
+ }
+ static Element getTestament(Scanner scanner, int numBooks) throws IOException{ // Starts a div of type "x-testament"; main() calls this once per testament with the shared books.dat scanner
+ Element testament = new Element(Tag.valueOf("div"), "")
+ .attr("type", "x-testament");
+ for(int i=0; i in page // NOTE(review): this line is truncated in this view; the rest of the loop and the start of the passage-processing code are missing. The statements below use passage, verse, footnote and osisId, which are declared in the missing part
+ String id = footnote.select("a").first().attr("href").substring(1); // href of the footnote's first link minus its first character (presumably the leading '#')
+ // The footnote's letter is the text of that same first link
+ String letter = footnote.select("a").first().text();
+ // Rename the marker element to "note", drop class/value, and set the n, osisRef and osisID attributes
+ footnote
+ .tagName("note")
+ .removeAttr("class").removeAttr("value")
+ .attr("n", letter)
+ .attr("osisRef", osisId)
+ .attr("osisID", osisId + "!" + letter);
+ // Grab the content of the footnote (the element with that id) from the footnotes section
+ Elements noteContent = passage.select("#" + id);
+ // The first link inside the footnote content is not required, so drop it
+ noteContent.select("a").first().remove();
+ // Subsequent links are to verses; unwrap them so that their text is kept
+ noteContent.select("a").unwrap();
+ // Italic text becomes a hi element with type="italic"
+ noteContent.select("i").tagName("hi").attr("type", "italic");
+ footnote.html(noteContent.html()); // the note's inner HTML becomes the cleaned-up footnote content
+ }
+ // Handle cross-references: every .crossreference element inside the verse
+ for(Element crossref : verse.select(".crossreference")){
+ String linkValue = crossref.attr("value"); // presumably an HTML link fragment stored in the value attribute; it is parsed as plain text below
+ // Content ID: the text between the first '#' and the following double quote
+ String id = linkValue.substring(linkValue.indexOf("#") + 1);
+ id = id.substring(0, id.indexOf("\""));
+ // Letter: the text between the first '>' and the following '<'
+ String letter = linkValue.substring(linkValue.indexOf(">") + 1);
+ letter = letter.substring(0, letter.indexOf("<"));
+ // Rename to "note", drop class/value, and mark it as a crossReference note
+ crossref
+ .tagName("note")
+ .removeAttr("class").removeAttr("value")
+ .attr("type", "crossReference")
+ .attr("n", letter)
+ .attr("osisID", osisId + "!crossReference." + letter);
+ // The reference list is the comma-separated data-bibleref attribute of the second link under the content element
+ String[] refs = passage.select("#" + id).select("a").get(1).attr("data-bibleref").split(",");
+ for(int i=0; i tag // NOTE(review): truncated in this view; ref used below is presumably refs[i] - confirm
+ Element refEl = createTag("reference").attr("osisRef", ref);
+ // Make the osisID readable by replacing its first four dots with ' ', ':', ' ' and ':' in turn
+ refEl.html(ref.replaceFirst("\\.", " ").replaceFirst("\\.", ":").replaceFirst("\\.", " ").replaceFirst("\\.", ":"));
+ // Append it to the cross-reference note
+ crossref.appendChild(refEl);
+ // Append the semicolon which the OSIS spec dictates (after every reference except the last)
+ if(i < refs.length - 1)
+ crossref.append("; ");
+ }
+ }
+ }
+ // Poetry: each p inside div.poetry becomes lg, and the div itself becomes a plain p
+ passage.select("div.poetry p").tagName("lg");
+ passage.select("lg verse").wrap(""); // NOTE(review): wrap() receives an empty string here; the wrapper markup may have been lost - confirm
+ passage.select("div.poetry").tagName("p").removeAttr("class");
+ passage.select("lg br").remove(); // line breaks inside lg are dropped
+ // Headings: unwrap the spans inside h3, then rename h3 to title
+ passage.select("h3 span").unwrap();
+ passage.select("h3").tagName("title");
+ // Remaining i elements become transchange with type="added"
+ passage.select("i").tagName("transchange").attr("type", "added");
+ // Ignore words of Jesus: the .woj wrapper is removed, keeping only its content
+ passage.select(".woj").unwrap();
+ // Remove footnotes / cross-references sections (which have served their purpose)
+ passage.select(".footnotes, .crossrefs").remove();
+ return passage;
+ }
+ /** Recursively removes every "#comment" node from the subtree below {@code node}. */
+ private static void removeComments(Node node) {
+     int index = 0;
+     // The index only advances when the current child is kept; after a removal the
+     // same index is examined again against the re-read child list.
+     while (index < node.childNodes().size()) {
+         Node current = node.childNode(index);
+         if (!current.nodeName().equals("#comment")) {
+             removeComments(current);
+             index++;
+             continue;
+         }
+         current.remove();
+     }
+ }
+ /** Creates a new, empty element named {@code tagName} with an empty base URI. */
+ private static Element createTag(String tagName){
+     Tag tag = Tag.valueOf(tagName);
+     return new Element(tag, "");
+ }
diff --git a/scraper/src/nz/co/dewar/biblescraper/MainNoFootnotes.java b/scraper/src/nz/co/dewar/biblescraper/MainNoFootnotes.java
new file mode 100644
index 0000000..caa38ee
--- /dev/null
+++ b/scraper/src/nz/co/dewar/biblescraper/MainNoFootnotes.java
@@ -0,0 +1,162 @@
+package nz.co.dewar.biblescraper;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.Scanner;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.parser.Tag;
+import org.jsoup.select.Elements;
+public class MainNoFootnotes {
+ static final int NUM_BOOKS_OT = 39; // Book count main() passes to getTestament for the Old Testament
+ static final int NUM_BOOKS_NT = 27; // Book count main() passes to getTestament for the New Testament
+ //static final String source = "D:\\George\\Development\\BibleGateway2OSIS\\{BOOK}-{CHAPTER}-{VERSION}.html";
+ static final String source = "http://mobile.biblegateway.com/passage/?search={BOOK}%20{CHAPTER}&version={VERSION}"; // URL template with {BOOK}, {CHAPTER} and {VERSION} placeholders; the code that fills them in is not visible here
+ static String version = "NKJV"; // Version code; main() puts it in the header text and uses it as the output file name (version + ".xml")
+ public static void main(String[] args) throws Exception{
+ String header = "\n"
+ + "\n"
+ + "\n\n"
+ + "\n"
+ + "\t\n"
+ + "\t\t" + version + "\n"
+ + "\t\n"
+ + "\n\n";
+ String footer = "\n\n\n";
+ // Get all the books of the old and new testament
+ Scanner bookScanner = new Scanner(new File("books.dat"));
+ Element oldTestament = getTestament(bookScanner, NUM_BOOKS_OT);
+ Element newTestament = getTestament(bookScanner, NUM_BOOKS_NT);
+ bookScanner.close();
+ // Output passage to String
+ String passageXml = (oldTestament.outerHtml() + newTestament.outerHtml())
+ .replaceAll("osisid", "osisID");
+ // Print output to file
+ PrintStream output = new PrintStream(new File(version + ".xml"), "UTF-8");
+ output.print(header + passageXml + footer);
+ output.close();
+ }
+ static Element getTestament(Scanner scanner, int numBooks) throws IOException{ // Starts a div of type "x-testament"; main() calls this once per testament with the shared books.dat scanner
+ Element testament = new Element(Tag.valueOf("div"), "")
+ .attr("type", "x-testament"); // NOTE(review): the for-loop on the next line is truncated in this view; what follows it is the tail of passage-processing code (poetry, headings, italics, words of Jesus) whose beginning is missing
+ for(int i=0; i");
+ passage.select("div.poetry").tagName("p").removeAttr("class");
+ passage.select("lg br").remove();
+ // Convert h3s to titles
+ passage.select("h3 span").unwrap();
+ passage.select("h3").tagName("title");
+ // Convert I tags to transchange
+ passage.select("i").tagName("transchange").attr("type", "added");
+ // Ignore words of Jesus
+ passage.select(".woj").unwrap();
+ return passage;
+ }
+ /** Recursively removes every "#comment" node from the subtree below {@code node}. */
+ private static void removeComments(Node node) {
+     int index = 0;
+     // The index only advances when the current child is kept; after a removal the
+     // same index is examined again against the re-read child list.
+     while (index < node.childNodes().size()) {
+         Node current = node.childNode(index);
+         if (!current.nodeName().equals("#comment")) {
+             removeComments(current);
+             index++;
+             continue;
+         }
+         current.remove();
+     }
+ }
