Skip to content
This repository has been archived by the owner on Sep 4, 2024. It is now read-only.

Commit

Permalink
Merge pull request #37 from YaleDHLab/data-collection
Browse files Browse the repository at this point in the history
add utils to collect images for #34 #9 #11 #19 #36 #33
  • Loading branch information
duhaime authored Aug 29, 2019
2 parents 81a46fe + c16a739 commit b715bc9
Show file tree
Hide file tree
Showing 2 changed files with 178 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
images
*-images/
.ipynb_checkpoints
.DS_Store
Expand Down
178 changes: 177 additions & 1 deletion download-images.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,182 @@
"\n",
"download_query_results()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Padova Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import requests\n",
"\n",
"# Download the recto and verso scans numbered 1-27 into ./padova.\n",
"out_dir = 'padova'\n",
"os.makedirs(out_dir, exist_ok=True)\n",
"\n",
"for side in ['recto', 'verso']:\n",
"    for number in range(1, 28):\n",
"        # remote files use a zero-padded, three-digit number, e.g. 001-recto.jpg\n",
"        img = '{0:03d}-{1}.jpg'.format(number, side)\n",
"        url = 'https://medicaltraditions.org/images/stories/manuscripts/demateriamedica/{0}'.format(img)\n",
"        try:\n",
"            # the timeout keeps one stalled connection from hanging the whole cell\n",
"            r = requests.get(url, timeout=30)\n",
"            # treat HTTP errors as failures so an error page is never saved under a .jpg name\n",
"            r.raise_for_status()\n",
"        except requests.RequestException as err:\n",
"            print(' ! could not fetch page', url, err)\n",
"            continue\n",
"        # the context manager closes the file handle even if the write fails\n",
"        with open(os.path.join(out_dir, img), 'wb') as out:\n",
"            out.write(r.content)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Schoenberg Images"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import requests\n",
"\n",
"# Download page images for two Schoenberg manuscripts into ./schoenberg/<book_id>/.\n",
"for book_id in ['ljs419', 'ljs062']:\n",
"\n",
"    out_dir = os.path.join('schoenberg', book_id)\n",
"    os.makedirs(out_dir, exist_ok=True)\n",
"\n",
"    # probe image indices 0-499 in both the 'front' and 'body' series of each book\n",
"    for i in range(500):\n",
"        # image names carry a zero-padded, four-digit index, e.g. ljs419_body0001\n",
"        idx = str(i).zfill(4)\n",
"        for section in ['front', 'body']:\n",
"            img = '{0}_{1}{2}'.format(book_id, section, idx)\n",
"            url = 'http://images.library.upenn.edu/mrsidsceti/bin/image_jpeg.pl?coll=schoenberg&subcoll={0}&image={1}.sid&level=2'.format(book_id, img)\n",
"            # Catch only network and file errors: a bare except would also swallow\n",
"            # KeyboardInterrupt, making this long loop impossible to stop from the notebook.\n",
"            try:\n",
"                r = requests.get(url, timeout=30)\n",
"                print(url, len(r.content))\n",
"                # responses of 1000 bytes or fewer are treated as missing pages and skipped\n",
"                if r.ok and len(r.content) > 1000:\n",
"                    # the context manager closes the file handle even if the write fails\n",
"                    with open(os.path.join(out_dir, img + '.jpg'), 'wb') as out:\n",
"                        out.write(r.content)\n",
"            except (requests.RequestException, OSError):\n",
"                print(' ! could not fetch page', url)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Bax - Italian Herbal"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import everything this cell uses so it also runs on a fresh kernel\n",
"import os\n",
"\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"out_dir = 'bax/italian-herbal/'\n",
"os.makedirs(out_dir, exist_ok=True)\n",
"\n",
"# The site runs Islandora. Given a listing of thumbnails such as\n",
"#   http://cdi.uvm.edu/islandora/object/uvmcdi%3A55290/pages\n",
"# each thumbnail url, e.g.\n",
"#   http://cdi.uvm.edu/islandora/object/uvmcdi%3A55304/datastream/TN/view\n",
"# can be rewritten to point at another Islandora DATASTREAM\n",
"# (https://wiki.duraspace.org/display/ISLANDORA/APPENDIX+C+-+DATASTREAM+REFERENCE), e.g.\n",
"#   http://cdi.uvm.edu/islandora/object/uvmcdi%3A55304/datastream/JPG/view\n",
"for page_idx, page in enumerate(range(1, 12)):\n",
"    page_url = 'http://cdi.uvm.edu/islandora/object/uvmcdi%3A55290/pages?page={0}'.format(page)\n",
"    html = requests.get(page_url, timeout=30).text\n",
"    # name the parser explicitly so results do not depend on which parsers are installed\n",
"    soup = BeautifulSoup(html, 'html.parser')\n",
"    grids = soup.select('.islandora-objects-grid')\n",
"    if not grids:\n",
"        # no thumbnail grid in the response; skip instead of raising IndexError\n",
"        print(' ! no thumbnails found on', page_url)\n",
"        continue\n",
"    for idx, thumb in enumerate(grids[0].select('img')):\n",
"        # swap the thumbnail (TN) datastream for the JPG datastream\n",
"        img_url = thumb['src'].replace('/TN/', '/JPG/')\n",
"        r = requests.get(img_url, timeout=30)\n",
"        if not r.ok:\n",
"            # never save an HTTP error page under a .jpg name\n",
"            print(' ! could not fetch page', img_url, r.status_code)\n",
"            continue\n",
"        img = '{0}-{1}.jpg'.format(page_idx, idx)\n",
"        # the context manager closes the file handle even if the write fails\n",
"        with open(os.path.join(out_dir, img), 'wb') as out:\n",
"            out.write(r.content)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Bax - General History of the Things of New Spain\n",
"Sahagun"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import requests\n",
"\n",
"out_dir = 'bax/sahagun/'\n",
"os.makedirs(out_dir, exist_ok=True)\n",
"\n",
"# probe image numbers 0-999; numbers with no image are dropped by the checks below\n",
"for i in range(1000):\n",
"    url = 'https://content.wdl.org/10622/service/thumbnail/1403114302/1024x1024/1/{0}.jpg'.format(i)\n",
"    try:\n",
"        # the timeout keeps one stalled connection from hanging the whole cell\n",
"        r = requests.get(url, timeout=30)\n",
"    except requests.RequestException as err:\n",
"        print(' ! could not fetch page', url, err)\n",
"        continue\n",
"    # keep only successful responses larger than 500 bytes; smaller bodies are treated as missing images\n",
"    if r.ok and len(r.content) > 500:\n",
"        img = str(i) + '.jpg'\n",
"        # the context manager closes the file handle even if the write fails\n",
"        with open(os.path.join(out_dir, img), 'wb') as out:\n",
"            out.write(r.content)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Bax - Kitab\n",
"Manuscrit. Kitāb 'ağā'ib al-maḫlūqāt wa ġarā'ib ... Qazwīnī, Zakariyyā ibn Muḥammad ibn Maḥmūd al- (1203-1283). Auteur du texte"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import requests\n",
"\n",
"out_dir = 'bax/kitab/'\n",
"os.makedirs(out_dir, exist_ok=True)\n",
"\n",
"# probe view numbers f0-f999; numbers with no image are dropped by the checks below\n",
"for i in range(1000):\n",
"    url = 'https://gallica.bnf.fr/ark:/12148/btv1b8406160j/f{0}.medres'.format(i)\n",
"    try:\n",
"        # the timeout keeps one stalled connection from hanging the whole cell\n",
"        r = requests.get(url, timeout=30)\n",
"    except requests.RequestException as err:\n",
"        print(' ! could not fetch page', url, err)\n",
"        continue\n",
"    # keep only successful responses larger than 500 bytes; smaller bodies are treated as missing images\n",
"    if r.ok and len(r.content) > 500:\n",
"        img = str(i) + '.jpg'\n",
"        # the context manager closes the file handle even if the write fails\n",
"        with open(os.path.join(out_dir, img), 'wb') as out:\n",
"            out.write(r.content)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MS. Canon. Misc. 408"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO: download not implemented yet; example IIIF endpoints for this manuscript:\n",
"# full-size image: https://iiif.bodleian.ox.ac.uk/iiif/image/c444f7e2-ca30-48ae-87b5-54f93d6ed046/full/full/0/default.jpg\n",
"# image metadata: https://iiif.bodleian.ox.ac.uk/iiif/image/c444f7e2-ca30-48ae-87b5-54f93d6ed046/info.json"
]
}
],
"metadata": {
Expand All @@ -243,7 +419,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.5.3"
}
},
"nbformat": 4,
Expand Down

0 comments on commit b715bc9

Please sign in to comment.