From a82b88e904245af14f618d5835ff58087af6303b Mon Sep 17 00:00:00 2001 From: George Dewar Date: Mon, 20 Jan 2014 20:00:56 +1300 Subject: [PATCH] Initial Commit --- .gitattributes | 22 ++ .gitignore | 221 ++++++++++++++++ scraper/books.dat | 66 +++++ scraper/pom.xml | 28 +++ .../dewar/biblescraper/DictionaryBuilder.java | 52 ++++ .../src/nz/co/dewar/biblescraper/Main.java | 236 ++++++++++++++++++ .../dewar/biblescraper/MainNoFootnotes.java | 162 ++++++++++++ script/build.cmd | 21 ++ script/push.cmd | 19 ++ 9 files changed, 827 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 scraper/books.dat create mode 100644 scraper/pom.xml create mode 100644 scraper/src/nz/co/dewar/biblescraper/DictionaryBuilder.java create mode 100644 scraper/src/nz/co/dewar/biblescraper/Main.java create mode 100644 scraper/src/nz/co/dewar/biblescraper/MainNoFootnotes.java create mode 100644 script/build.cmd create mode 100644 script/push.cmd diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..412eeda --- /dev/null +++ b/.gitattributes @@ -0,0 +1,22 @@ +# Auto detect text files and perform LF normalization +* text=auto + +# Custom for Visual Studio +*.cs diff=csharp +*.sln merge=union +*.csproj merge=union +*.vbproj merge=union +*.fsproj merge=union +*.dbproj merge=union + +# Standard to msysgit +*.doc diff=astextplain +*.DOC diff=astextplain +*.docx diff=astextplain +*.DOCX diff=astextplain +*.dot diff=astextplain +*.DOT diff=astextplain +*.pdf diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF diff=astextplain diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6488272 --- /dev/null +++ b/.gitignore @@ -0,0 +1,221 @@ +################# +## Eclipse +################# + +*.pydevproject +.project +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.classpath +.settings/ +.loadpath + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# CDT-specific +.cproject + +# PDT-specific +.buildpath + + +################# +## Visual Studio +################# + +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. + +# User-specific files +*.suo +*.user +*.sln.docstates + +# Build results + +[Dd]ebug/ +[Rr]elease/ +x64/ +build/ +[Bb]in/ +[Oo]bj/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +*_i.c +*_p.c +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.log +*.scc + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opensdf +*.sdf +*.cachefile + +# Visual Studio profiler +*.psess +*.vsp +*.vspx + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# NCrunch +*.ncrunch* +.*crunch*.local.xml + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.Publish.xml +*.pubxml + +# NuGet Packages Directory +## TODO: If you have NuGet Package Restore enabled, uncomment the next line +#packages/ + +# Windows Azure Build Output +csx +*.build.csdef + +# Windows Store app package directory +AppPackages/ + +# Others +sql/ +*.Cache +ClientBin/ +[Ss]tyle[Cc]op.* +~$* +*~ +*.dbmdl +*.[Pp]ublish.xml +*.pfx +*.publishsettings + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file to a newer +# Visual Studio version. Backup files are not needed, because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm + +# SQL Server files +App_Data/*.mdf +App_Data/*.ldf + +############# +## Windows detritus +############# + +# Windows image file caches +Thumbs.db +ehthumbs.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Mac crap +.DS_Store + + +############# +## Python +############# + +*.py[co] + +# Packages +*.egg +*.egg-info +dist/ +build/ +eggs/ +parts/ +var/ +sdist/ +develop-eggs/ +.installed.cfg + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.tox + +#Translations +*.mo + +#Mr Developer +.mr.developer.cfg + +############# +## Java +############# + +target/ \ No newline at end of file diff --git a/scraper/books.dat b/scraper/books.dat new file mode 100644 index 0000000..860bb22 --- /dev/null +++ b/scraper/books.dat @@ -0,0 +1,66 @@ +Gen 50 +Exod 40 +Lev 27 +Num 36 +Deut 34 +Josh 24 +Judg 21 +Ruth 4 +1Sam 31 +2Sam 24 +1Kgs 22 +2Kgs 25 +1Chr 29 +2Chr 36 +Ezra 10 +Neh 13 +Esth 10 +Job 42 +Ps 150 +Prov 31 +Eccl 12 +Song 8 +Isa 66 +Jer 52 +Lam 5 +Ezek 48 +Dan 12 +Hos 14 +Joel 3 +Amos 9 +Obad 1 +Jonah 4 +Mic 7 +Nah 3 +Hab 3 +Zeph 3 +Hag 2 +Zech 14 +Mal 4 +Matt 28 +Mark 16 +Luke 24 +John 21 +Acts 28 +Rom 16 +1Cor 16 +2Cor 13 +Gal 6 +Eph 6 +Phil 4 +Col 4 +1Thess 5 +2Thess 3 +1Tim 6 +2Tim 4 +Titus 3 +Phlm 1 +Heb 13 +Jas 5 +1Pet 5 +2Pet 3 +1John 5 +2John 1 +3John 1 +Jude 1 +Rev 22 diff --git a/scraper/pom.xml b/scraper/pom.xml new file mode 100644 index 0000000..7433966 --- /dev/null +++ b/scraper/pom.xml @@ -0,0 +1,28 @@ + + 4.0.0 + nz.co.dewar + biblescraper + 0.0.1-SNAPSHOT + + src + + + maven-compiler-plugin + 3.1 + + 1.6 + 1.6 + + + + + + + + org.jsoup + jsoup + 1.7.2 + + + \ No newline at end of file diff --git a/scraper/src/nz/co/dewar/biblescraper/DictionaryBuilder.java b/scraper/src/nz/co/dewar/biblescraper/DictionaryBuilder.java new file mode 100644 index 0000000..ac54140 --- /dev/null +++ b/scraper/src/nz/co/dewar/biblescraper/DictionaryBuilder.java @@ -0,0 +1,52 @@ +package nz.co.dewar.biblescraper; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +public class DictionaryBuilder { + + static String fileName = "NIV.xml"; + static String outputFileName = "verses.txt"; + + public static void main(String[] args) throws IOException{ + + Document doc = Jsoup.parse(new File(fileName), "UTF-8"); + Elements verses = doc.select("verse"); + + // Remove footnotes and cross-references + verses.select("note").remove(); + + PrintStream output = new PrintStream(new File(outputFileName), "UTF-8"); + + for(Element verse : verses){ + // Get the raw text of the verse + String verseText = verse.text(); + + // Turn dashes into spaces (the code below will remove them and inappropriately conjoin words) + verseText = verseText.replaceAll("—", " ").trim(); + + // Clean it up (remove all punctuation, excess spacing, etc) + verseText = verseText.replaceAll("[^a-zA-Z0-9 ]", "").trim(); + while(verseText.contains(" ")){ + verseText = verseText.replaceAll(" ", " "); + } + + // Make it be upper case for case-insensitive matching + verseText = verseText.toUpperCase(); + + // Write it out + output.println(verseText); + } + + output.close(); + } + + + +} diff --git a/scraper/src/nz/co/dewar/biblescraper/Main.java b/scraper/src/nz/co/dewar/biblescraper/Main.java new file mode 100644 index 0000000..5117406 --- /dev/null +++ b/scraper/src/nz/co/dewar/biblescraper/Main.java @@ -0,0 +1,236 @@ +package nz.co.dewar.biblescraper; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Scanner; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.parser.Tag; +import org.jsoup.select.Elements; + +public class Main { + + static final int NUM_BOOKS_OT = 39; //39; + static final int NUM_BOOKS_NT = 27; //27; + + //static final String source = "D:\\George\\Development\\BibleGateway2OSIS\\{BOOK}-{CHAPTER}-{VERSION}.html"; + static final String source = "http://mobile.biblegateway.com/passage/?search={BOOK}%20{CHAPTER}&version={VERSION}"; + + static String version = "NKJV"; + + public static void main(String[] args) throws Exception{ + + String header = "\n" + + "\n" + + + "\n\n" + + "
\n" + + "\t\n" + + "\t\t" + version + "\n" + + "\t\n" + + "
\n\n"; + + String footer = "\n
\n
\n"; + + // Get all the books of the old and new testament + Scanner bookScanner = new Scanner(new File("books.dat")); + Element oldTestament = getTestament(bookScanner, NUM_BOOKS_OT); + Element newTestament = getTestament(bookScanner, NUM_BOOKS_NT); + bookScanner.close(); + + // Output passage to String + String passageXml = (oldTestament.outerHtml() + newTestament.outerHtml()) + .replaceAll("osisid", "osisID") + .replaceAll("osisref", "osisRef") + .replaceAll("divinename", "divineName"); + + // Remove whitespace from before a tag to fix mystery blank cross-reference in AndBible + passageXml = passageXml.replaceAll("[ \t\r\n]*", ""); + + // Print output to file + PrintStream output = new PrintStream(new File(version + ".xml"), "UTF-8"); + output.print(header + passageXml + footer); + output.close(); + + } + + static Element getTestament(Scanner scanner, int numBooks) throws IOException{ + Element testament = new Element(Tag.valueOf("div"), "") + .attr("type", "x-testament"); + + for(int i=0; i in page + String id = footnote.select("a").first().attr("href").substring(1); + // Determine letter of footnote + String letter = footnote.select("a").first().text(); + // Transform tag into tag with required attributes + footnote + .tagName("note") + .removeAttr("class").removeAttr("value") + .attr("n", letter) + .attr("osisRef", osisId) + .attr("osisID", osisId + "!" + letter); + // Grab the content of the footnote from the footnotes section + Elements noteContent = passage.select("#" + id); + // The first link in a footnote is not required + noteContent.select("a").first().remove(); + // Subsequent links are to verses, and the text of these must be kept + noteContent.select("a").unwrap(); + // Change italic text to correct tag + noteContent.select("i").tagName("hi").attr("type", "italic"); + footnote.html(noteContent.html()); + } + + // Handle cross-references + for(Element crossref : verse.select(".crossreference")){ + String linkValue = crossref.attr("value"); + // Determine ID of content
  • in page + String id = linkValue.substring(linkValue.indexOf("#") + 1); + id = id.substring(0, id.indexOf("\"")); + // Determine letter of footnote + String letter = linkValue.substring(linkValue.indexOf(">") + 1); + letter = letter.substring(0, letter.indexOf("<")); + // Transform tag into tag with required attributes + crossref + .tagName("note") + .removeAttr("class").removeAttr("value") + .attr("type", "crossReference") + .attr("n", letter) + .attr("osisID", osisId + "!crossReference." + letter); + // Grab the cross-reference IDs from the footnote link + String[] refs = passage.select("#" + id).select("a").get(1).attr("data-bibleref").split(","); + for(int i=0; i tag + Element refEl = createTag("reference").attr("osisRef", ref); + // Change the osisID notation into readable notation, with spaces and :s + refEl.html(ref.replaceFirst("\\.", " ").replaceFirst("\\.", ":").replaceFirst("\\.", " ").replaceFirst("\\.", ":")); + // Append it to the element + crossref.appendChild(refEl); + // Append the semicolon which the OSIS spec dictates + if(i < refs.length - 1) + crossref.append("; "); + } + } + } + + // Convert poetry div to lg + passage.select("div.poetry p").tagName("lg"); + passage.select("lg verse").wrap(""); + passage.select("div.poetry").tagName("p").removeAttr("class"); + passage.select("lg br").remove(); + + // Convert h3s to titles + passage.select("h3 span").unwrap(); + passage.select("h3").tagName("title"); + + // Convert I tags to transchange + passage.select("i").tagName("transchange").attr("type", "added"); + + // Ignore words of Jesus + passage.select(".woj").unwrap(); + + // Remove footnotes / cross-references sections (which have served their purpose) + passage.select(".footnotes, .crossrefs").remove(); + + return passage; + } + + private static void removeComments(Node node) { + for (int i = 0; i < node.childNodes().size();) { + Node child = node.childNode(i); + if (child.nodeName().equals("#comment")) + child.remove(); + else { + removeComments(child); + i++; + } + } + } + + private static Element createTag(String tagName){ + return new Element(Tag.valueOf(tagName), ""); + } + +} diff --git a/scraper/src/nz/co/dewar/biblescraper/MainNoFootnotes.java b/scraper/src/nz/co/dewar/biblescraper/MainNoFootnotes.java new file mode 100644 index 0000000..caa38ee --- /dev/null +++ b/scraper/src/nz/co/dewar/biblescraper/MainNoFootnotes.java @@ -0,0 +1,162 @@ +package nz.co.dewar.biblescraper; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Scanner; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.parser.Tag; +import org.jsoup.select.Elements; + +public class MainNoFootnotes { + + static final int NUM_BOOKS_OT = 39; + static final int NUM_BOOKS_NT = 27; + + //static final String source = "D:\\George\\Development\\BibleGateway2OSIS\\{BOOK}-{CHAPTER}-{VERSION}.html"; + static final String source = "http://mobile.biblegateway.com/passage/?search={BOOK}%20{CHAPTER}&version={VERSION}"; + + static String version = "NKJV"; + + public static void main(String[] args) throws Exception{ + + String header = "\n" + + "\n" + + + "\n\n" + + "
    \n" + + "\t\n" + + "\t\t" + version + "\n" + + "\t\n" + + "
    \n\n"; + + String footer = "\n
    \n
    \n"; + + // Get all the books of the old and new testament + Scanner bookScanner = new Scanner(new File("books.dat")); + Element oldTestament = getTestament(bookScanner, NUM_BOOKS_OT); + Element newTestament = getTestament(bookScanner, NUM_BOOKS_NT); + bookScanner.close(); + + // Output passage to String + String passageXml = (oldTestament.outerHtml() + newTestament.outerHtml()) + .replaceAll("osisid", "osisID"); + + // Print output to file + PrintStream output = new PrintStream(new File(version + ".xml"), "UTF-8"); + output.print(header + passageXml + footer); + output.close(); + + } + + static Element getTestament(Scanner scanner, int numBooks) throws IOException{ + Element testament = new Element(Tag.valueOf("div"), "") + .attr("type", "x-testament"); + + for(int i=0; i"); + passage.select("div.poetry").tagName("p").removeAttr("class"); + passage.select("lg br").remove(); + + // Convert h3s to titles + passage.select("h3 span").unwrap(); + passage.select("h3").tagName("title"); + + // Convert I tags to transchange + passage.select("i").tagName("transchange").attr("type", "added"); + + // Ignore words of Jesus + passage.select(".woj").unwrap(); + + + return passage; + } + + private static void removeComments(Node node) { + for (int i = 0; i < node.childNodes().size();) { + Node child = node.childNode(i); + if (child.nodeName().equals("#comment")) + child.remove(); + else { + removeComments(child); + i++; + } + } + } + +} diff --git a/script/build.cmd b/script/build.cmd new file mode 100644 index 0000000..356f2f3 --- /dev/null +++ b/script/build.cmd @@ -0,0 +1,21 @@ +@echo off + +IF [%1]==[] goto usage +IF [%2]==[] goto usage + +IF NOT EXIST %1 mkdir %1 + +sword-utilities-1.6.2\osis2mod %1 %1.xml -z -v %2 +goto end + +:usage +echo ------------------------------------------------------------------------------- +echo build: Build a bible translation +echo ------------------------------------------------------------------------------- +echo. +echo Usage: build ^ ^ +echo. +echo Versification can be KJV or NRSV +echo. + +:end diff --git a/script/push.cmd b/script/push.cmd new file mode 100644 index 0000000..6a9162a --- /dev/null +++ b/script/push.cmd @@ -0,0 +1,19 @@ +@echo off + +IF [%1]==[] goto usage + +adb push %1.conf /mnt/sdcard/Android/data/net.bible.android.activity/files/mods.d/%1.conf +adb push %1 /mnt/sdcard/Android/data/net.bible.android.activity/files/modules/texts/ztext/%1/ + +IF "%2"=="-S" adb shell am start -S -n net.bible.android.activity/.StartupActivity + +goto end + +:usage +echo push: Copy the translation's data files to the phone +echo +echo Usage: push ^ +echo + +:end +