Merge pull request #37 from YaleDHLab/data-collection

add utils to collect images for #34 #9 #11 #19 #36 #33
YaleDHLab · Aug 29, 2019 · b715bc9 · b715bc9
2 parents 81a46fe + c16a739
commit b715bc9
Show file tree

Hide file tree

Showing 2 changed files with 178 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
+images
 *-images/
 .ipynb_checkpoints
 .DS_Store

diff --git a/download-images.ipynb b/download-images.ipynb
@@ -225,6 +225,182 @@
     "\n",
     "download_query_results()"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download Padova Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests, os\n",
+    "\n",
+    "# Download the recto and verso scan of each Padova folio into out_dir.\n",
+    "out_dir = 'padova'\n",
+    "os.makedirs(out_dir, exist_ok=True)\n",
+    "\n",
+    "for side in ['recto', 'verso']:\n",
+    "  # request folio numbers 1-27, zero-padded to three digits\n",
+    "  for folio in range(1, 28):\n",
+    "    img = '{0}-{1}.jpg'.format(str(folio).zfill(3), side)\n",
+    "    url = 'https://medicaltraditions.org/images/stories/manuscripts/demateriamedica/{0}'.format(img)\n",
+    "    r = requests.get(url)\n",
+    "    # skip error responses so an error page is never saved as a .jpg\n",
+    "    if not r.ok:\n",
+    "      print(' ! could not fetch image', url, r.status_code)\n",
+    "      continue\n",
+    "    # the context manager closes the file handle as soon as the write finishes\n",
+    "    with open(os.path.join(out_dir, img), 'wb') as out:\n",
+    "      out.write(r.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Download Schoenberg Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests, os\n",
+    "\n",
+    "# Probe image indices 0-499 in the 'front' and 'body' sequences of each\n",
+    "# Schoenberg book and save every response larger than 1000 bytes.\n",
+    "for book_id in ['ljs419', 'ljs062']:\n",
+    "\n",
+    "  out_dir = os.path.join('schoenberg', book_id)\n",
+    "  os.makedirs(out_dir, exist_ok=True)\n",
+    "\n",
+    "  for i in range(500):\n",
+    "    # image names use a four-digit, zero-padded index\n",
+    "    idx = str(i).zfill(4)\n",
+    "    for section in ['front', 'body']:\n",
+    "      img = '{0}_{1}{2}'.format(book_id, section, idx)\n",
+    "      url = 'http://images.library.upenn.edu/mrsidsceti/bin/image_jpeg.pl?coll=schoenberg&subcoll={0}&image={1}.sid&level=2'.format(book_id, img)\n",
+    "      try:\n",
+    "        r = requests.get(url)\n",
+    "        print(url, len(r.content))\n",
+    "        # responses of 1000 bytes or fewer are discarded\n",
+    "        # (presumably error/placeholder bodies for missing pages - confirm)\n",
+    "        if len(r.content) > 1000:\n",
+    "          with open(os.path.join(out_dir, img + '.jpg'), 'wb') as out:\n",
+    "            out.write(r.content)\n",
+    "      # catch Exception rather than everything so that interrupting the\n",
+    "      # kernel (KeyboardInterrupt) still stops this long-running loop\n",
+    "      except Exception:\n",
+    "        print(' ! could not fetch page', url)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Bax - Italian Herbal"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os, requests\n",
+    "from bs4 import BeautifulSoup\n",
+    "\n",
+    "out_dir = 'bax/italian-herbal/'\n",
+    "os.makedirs(out_dir, exist_ok=True)\n",
+    "\n",
+    "# system uses Islandora and given a list of thumbnails like:\n",
+    "# http://cdi.uvm.edu/islandora/object/uvmcdi%3A55290/pages\n",
+    "# one can transform each thumbnail url from\n",
+    "# http://cdi.uvm.edu/islandora/object/uvmcdi%3A55304/datastream/TN/view\n",
+    "# to another DATASTREAM in Islandora https://wiki.duraspace.org/display/ISLANDORA/APPENDIX+C+-+DATASTREAM+REFERENCE\n",
+    "# e.g. http://cdi.uvm.edu/islandora/object/uvmcdi%3A55304/datastream/JPG/view\n",
+    "for page_idx, page in enumerate(range(1, 12)):\n",
+    "  page_url = 'http://cdi.uvm.edu/islandora/object/uvmcdi%3A55290/pages?page={0}'.format(page)\n",
+    "  html = requests.get(page_url).text\n",
+    "  # name the parser explicitly so the result does not depend on which\n",
+    "  # optional parsers happen to be installed\n",
+    "  soup = BeautifulSoup(html, 'html.parser')\n",
+    "  grids = soup.select('.islandora-objects-grid')\n",
+    "  # a listing page without a thumbnail grid is reported and skipped\n",
+    "  if not grids:\n",
+    "    print(' ! no thumbnail grid found on', page_url)\n",
+    "    continue\n",
+    "  for idx, thumb in enumerate(grids[0].select('img')):\n",
+    "    # swap the thumbnail (TN) datastream for the JPG datastream\n",
+    "    img_url = thumb['src'].replace('/TN/', '/JPG/')\n",
+    "    r = requests.get(img_url)\n",
+    "    # files are named <zero-based listing page>-<position on that page>.jpg\n",
+    "    img = '{0}-{1}.jpg'.format(page_idx, idx)\n",
+    "    with open(os.path.join(out_dir, img), 'wb') as out:\n",
+    "      out.write(r.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Bax - General History of the Things of New Spain\n",
+    "Sahagun"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os, requests\n",
+    "\n",
+    "out_dir = 'bax/sahagun/'\n",
+    "os.makedirs(out_dir, exist_ok=True)\n",
+    "\n",
+    "# probe image numbers 0-999 and save each usable response as <number>.jpg\n",
+    "for i in range(1000):\n",
+    "  url = 'https://content.wdl.org/10622/service/thumbnail/1403114302/1024x1024/1/{0}.jpg'.format(i)\n",
+    "  r = requests.get(url)\n",
+    "  # keep only successful responses with more than 500 bytes of content,\n",
+    "  # so error pages are never written to disk as images\n",
+    "  if r.ok and len(r.content) > 500:\n",
+    "    img = str(i) + '.jpg'\n",
+    "    # the context manager closes the file handle as soon as the write finishes\n",
+    "    with open(os.path.join(out_dir, img), 'wb') as out:\n",
+    "      out.write(r.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Bax - Kitab\n",
+    "Manuscrit: Kitāb 'ağā'ib al-maḫlūqāt wa ġarā'ib ... Qazwīnī, Zakariyyā ibn Muḥammad ibn Maḥmūd al- (1203-1283). Auteur du texte"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os, requests\n",
+    "\n",
+    "out_dir = 'bax/kitab/'\n",
+    "os.makedirs(out_dir, exist_ok=True)\n",
+    "\n",
+    "# probe folio numbers 0-999 and save each usable response as <number>.jpg\n",
+    "for i in range(1000):\n",
+    "  url = 'https://gallica.bnf.fr/ark:/12148/btv1b8406160j/f{0}.medres'.format(i)\n",
+    "  r = requests.get(url)\n",
+    "  # keep only successful responses with more than 500 bytes of content,\n",
+    "  # so error pages are never written to disk as images\n",
+    "  if r.ok and len(r.content) > 500:\n",
+    "    img = str(i) + '.jpg'\n",
+    "    # the context manager closes the file handle as soon as the write finishes\n",
+    "    with open(os.path.join(out_dir, img), 'wb') as out:\n",
+    "      out.write(r.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# MS. Canon. Misc. 408"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# url https://iiif.bodleian.ox.ac.uk/iiif/image/c444f7e2-ca30-48ae-87b5-54f93d6ed046/full/full/0/default.jpg\n",
+    "# info https://iiif.bodleian.ox.ac.uk/iiif/image/c444f7e2-ca30-48ae-87b5-54f93d6ed046/info.json"
+   ]
   }
  ],
  "metadata": {
@@ -243,7 +419,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.5"
+   "version": "3.5.3"
   }
  },
  "nbformat": 4,