From c16a739b340a157bb748c856cdd2a805947fecad Mon Sep 17 00:00:00 2001 From: duhaime Date: Thu, 29 Aug 2019 15:30:33 -0400 Subject: [PATCH] add utils to collect images for #34 #9 #11 #19 #36 #33 --- .gitignore | 1 + download-images.ipynb | 178 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 178 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index e276a35..0380402 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +images *-images/ .ipynb_checkpoints .DS_Store diff --git a/download-images.ipynb b/download-images.ipynb index c8ffd20..f660c78 100644 --- a/download-images.ipynb +++ b/download-images.ipynb @@ -225,6 +225,182 @@ "\n", "download_query_results()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download Padova Images" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests, os\n", + "\n", + "out_dir = 'padova'\n", + "if not os.path.exists(out_dir):\n", + " os.makedirs(out_dir)\n", + "\n", + "for i in ['recto', 'verso']:\n", + " for j in range(1,28,1):\n", + " num = str(j)\n", + " while len(num) < 3:\n", + " num = '0' + num\n", + " img = '{0}-{1}.jpg'.format(num, i)\n", + " url = 'https://medicaltraditions.org/images/stories/manuscripts/demateriamedica/{0}'.format(img)\n", + " r = requests.get(url)\n", + " with open(os.path.join(out_dir, img), 'wb') as f: f.write(r.content)  # with closes the file handle after each image" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download Schoenberg Images" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests, os\n", + "\n", + "for book_id in ['ljs419', 'ljs062']:\n", + " \n", + " out_dir = os.path.join('schoenberg', book_id)\n", + " if not os.path.exists(out_dir):\n", + " os.makedirs(out_dir)\n", + " \n", + " for i in range(500):\n", + " idx = str(i)\n", + " while len(idx) < 4: idx = '0' + idx\n", + " for j in ['front', 'body']:\n", + " 
try:\n", + " img = '{0}_{1}{2}'.format(book_id, j, idx)\n", + " url = 'http://images.library.upenn.edu/mrsidsceti/bin/image_jpeg.pl?coll=schoenberg&subcoll={0}&image={1}.sid&level=2'.format(book_id, img)\n", + " r = requests.get(url)\n", + " print(url, len(r.content))\n", + " if len(r.content) > 1000:\n", + " with open(os.path.join(out_dir, img + '.jpg'), 'wb') as f: f.write(r.content)\n", + " except Exception:  # not a bare except, so a kernel interrupt still stops the loop\n", + " print(' ! could not fetch page', url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bax - Italian Herbal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, requests\n", + "from bs4 import BeautifulSoup\n", + "out_dir = 'bax/italian-herbal/'\n", + "if not os.path.exists(out_dir):\n", + " os.makedirs(out_dir)\n", + "\n", + "# system uses Islandora and given a list of thumbnails like:\n", + "# http://cdi.uvm.edu/islandora/object/uvmcdi%3A55290/pages\n", + "# one can transform each thumbnail url from\n", + "# http://cdi.uvm.edu/islandora/object/uvmcdi%3A55304/datastream/TN/view\n", + "# to another DATASTREAM in Islandora https://wiki.duraspace.org/display/ISLANDORA/APPENDIX+C+-+DATASTREAM+REFERENCE\n", + "# e.g. 
http://cdi.uvm.edu/islandora/object/uvmcdi%3A55304/datastream/JPG/view\n", + "for page_idx, page in enumerate(range(1, 12, 1)):\n", + " url = 'http://cdi.uvm.edu/islandora/object/uvmcdi%3A55290/pages?page={0}'.format(page)\n", + " html = requests.get(url).text\n", + " soup = BeautifulSoup(html, 'html.parser')  # explicit stdlib parser: no bs4 warning, no dependence on which parsers are installed\n", + " for idx, i in enumerate(soup.select('.islandora-objects-grid')[0].select('img')):\n", + " src = i['src']\n", + " url = src.replace('/TN/', '/JPG/')\n", + " r = requests.get(url)\n", + " img = '{0}-{1}.jpg'.format(page_idx, idx)\n", + " with open(os.path.join(out_dir, img), 'wb') as f: f.write(r.content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bax - General History of the Things of New Spain\n", + "Sahagun" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, requests\n", + "\n", + "out_dir = 'bax/sahagun/'\n", + "if not os.path.exists(out_dir):\n", + " os.makedirs(out_dir)\n", + " \n", + "for i in range(1000):\n", + " url = 'https://content.wdl.org/10622/service/thumbnail/1403114302/1024x1024/1/{0}.jpg'.format(i)\n", + " r = requests.get(url)\n", + " if len(r.content) > 500:\n", + " img = str(i) + '.jpg'\n", + " with open(os.path.join(out_dir, img), 'wb') as f: f.write(r.content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bax - Kitab\n", + "ManuscritKitāb 'ağā'ib al-maḫlūqāt wa ġarā'ib ... Qazwīnī, Zakariyyā ibn Muḥammad ibn Maḥmūd al- (1203-1283). 
Auteur du texte" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, requests\n", + "\n", + "out_dir = 'bax/kitab/'\n", + "if not os.path.exists(out_dir):\n", + " os.makedirs(out_dir)\n", + "\n", + "for i in range(1000):\n", + " url = 'https://gallica.bnf.fr/ark:/12148/btv1b8406160j/f{0}.medres'.format(i)\n", + " r = requests.get(url)\n", + " if len(r.content) > 500:\n", + " img = str(i) + '.jpg'\n", + " with open(os.path.join(out_dir, img), 'wb') as f: f.write(r.content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MS. Canon. Misc. 408" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# url https://iiif.bodleian.ox.ac.uk/iiif/image/c444f7e2-ca30-48ae-87b5-54f93d6ed046/full/full/0/default.jpg\n", + "# info https://iiif.bodleian.ox.ac.uk/iiif/image/c444f7e2-ca30-48ae-87b5-54f93d6ed046/info.json" + ] + } ], "metadata": { @@ -243,7 +419,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.5.3" } }, "nbformat": 4,