diff --git a/README.md b/README.md index 6bd3cf5..b29b598 100644 --- a/README.md +++ b/README.md @@ -2,17 +2,21 @@ [![PyPI version](https://badge.fury.io/py/midv500.svg)](https://badge.fury.io/py/midv500) ![CI](https://github.com/fcakyon/midv500/workflows/CI/badge.svg) -## Download and convert MIDV-500 dataset into COCO instance segmentation format -Automatically download/unzip MIDV-500 dataset and convert the annotations into COCO instance segmentation format. +## Download and convert MIDV-500 datasets into COCO instance segmentation format +Automatically download/unzip [MIDV-500](https://arxiv.org/abs/1807.05786) and [MIDV-2019](https://arxiv.org/abs/1910.04009) datasets and convert the annotations into COCO instance segmentation format. Then, dataset can be directly used in the training of Yolact, Detectron type of models. -## MIDV-500 Dataset -MIDV-500 consists of 500 video clips for 50 different identity document types with ground truth which allows to perform research in a wide scope of various document analysis problems. +## MIDV-500 Datasets +MIDV-500 consists of 500 video clips for 50 different identity document types including 17 ID cards, 14 passports, 13 driving licences and 6 other identity documents of different countries with ground truth which allows to perform research in a wide scope of various document analysis problems. Additionally, MIDV-2019 dataset contains distorted and low light images in it. 
teaser -You can find more detail on: [MIDV-500: A Dataset for Identity Documents Analysis and Recognition on Mobile Devices in Video Stream](https://arxiv.org/abs/1807.05786) +You can find more details in the papers: + +[MIDV-500: A Dataset for Identity Documents Analysis and Recognition on Mobile Devices in Video Stream](https://arxiv.org/abs/1807.05786) + +[MIDV-2019: Challenges of the modern mobile-based document OCR](https://arxiv.org/abs/1910.04009) ## Getting started @@ -22,20 +26,41 @@ pip install midv500 ``` ### Usage + +- Import package: + ```python -# import package import midv500 +``` + +- Download and unzip desired version of the dataset: +```python # set directory for dataset to be downloaded -dataset_dir = 'data/midv500/' +dataset_dir = 'midv500_data/' -# download and unzip midv500 dataset -midv500.download_dataset(dataset_dir) +# download and unzip the base midv500 dataset +dataset_name = "midv500" +midv500.download_dataset(dataset_dir, dataset_name) +# or download and unzip the midv2019 dataset that includes low light images +dataset_name = "midv2019" +midv500.download_dataset(dataset_dir, dataset_name) + +# or download and unzip both midv500 and midv2019 datasets +dataset_name = "all" +midv500.download_dataset(dataset_dir, dataset_name) +``` + +- Convert downloaded dataset to coco format: + +```python # set directory for coco annotations to be saved -export_dir = 'data/midv500/' +export_dir = 'midv500_data/' + +# set the desired name of the coco file, coco file will be exported as "filename + '_coco.json'" +filename = 'midv500' # convert midv500 annotations to coco format -midv500.convert_to_coco(dataset_dir, export_dir) +midv500.convert_to_coco(dataset_dir, export_dir, filename) ``` - diff --git a/midv500/__init__.py b/midv500/__init__.py index 7ff2d56..48badcb 100644 --- a/midv500/__init__.py +++ b/midv500/__init__.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -__version__ = "0.1.3" +__version__ = "0.2.0" from midv500.convert_dataset 
import convert as convert_to_coco diff --git a/midv500/convert_dataset.py b/midv500/convert_dataset.py index 43adaa3..8d2a33d 100644 --- a/midv500/convert_dataset.py +++ b/midv500/convert_dataset.py @@ -11,7 +11,7 @@ ) -def convert(root_dir: str, export_dir: str): +def convert(root_dir: str, export_dir: str, filename: str): """ Walks inside root_dir (should oly contain original midv500 dataset folders), reads all annotations, and creates coco styled annotation file @@ -114,7 +114,7 @@ def convert(root_dir: str, export_dir: str): coco_dict["categories"] = [{"name": "id_card", "id": 1}] # export coco dict - export_path = os.path.join(export_dir, "midv500_coco.json") + export_path = os.path.join(export_dir, filename + "_coco.json") with open(export_path, "w") as f: json.dump(coco_dict, f) diff --git a/midv500/download_dataset.py b/midv500/download_dataset.py index f348677..779570b 100644 --- a/midv500/download_dataset.py +++ b/midv500/download_dataset.py @@ -2,7 +2,7 @@ import argparse from midv500.utils import download, unzip -all_links = [ +midv500_links = [ "ftp://smartengines.com/midv-500/dataset/01_alb_id.zip", "ftp://smartengines.com/midv-500/dataset/02_aut_drvlic_new.zip", "ftp://smartengines.com/midv-500/dataset/03_aut_id_old.zip", @@ -56,24 +56,104 @@ ] -def download_dataset(download_dir: str): +midv2019_links = [ + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/01_alb_id.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/02_aut_drvlic_new.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/03_aut_id_old.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/04_aut_id.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/05_aze_passport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/06_bra_passport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/07_chl_id.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/08_chn_homereturn.zip", + 
"ftp://smartengines.com/midv-500/extra/midv-2019/dataset/09_chn_id.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/10_cze_id.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/11_cze_passport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/12_deu_drvlic_new.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/13_deu_drvlic_old.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/14_deu_id_new.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/15_deu_id_old.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/16_deu_passport_new.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/17_deu_passport_old.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/18_dza_passport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/19_esp_drvlic.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/20_esp_id_new.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/21_esp_id_old.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/22_est_id.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/23_fin_drvlic.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/24_fin_id.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/25_grc_passport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/26_hrv_drvlic.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/27_hrv_passport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/28_hun_passport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/29_irn_drvlic.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/30_ita_drvlic.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/31_jpn_drvlic.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/32_lva_passport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/33_mac_id.zip", + 
"ftp://smartengines.com/midv-500/extra/midv-2019/dataset/34_mda_passport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/35_nor_drvlic.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/36_pol_drvlic.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/37_prt_id.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/38_rou_drvlic.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/39_rus_internalpassport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/40_srb_id.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/41_srb_passport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/42_svk_id.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/43_tur_id.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/44_ukr_id.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/45_ukr_passport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/46_ury_passport.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/47_usa_bordercrossing.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/48_usa_passportcard.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/49_usa_ssn82.zip", + "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/50_xpo_id.zip", +] + + +def download_dataset(download_dir: str, dataset_name: str = "midv500"): """ - This script downloads the MIDV-500 dataset and unzips the folders. + This script downloads the MIDV-500 dataset with extra files and unzips the folders. 
+ dataset_name: str + "midv500": https://arxiv.org/abs/1807.05786 + "midv2019": https://arxiv.org/abs/1910.04009 + "all": midv500 + midv2019 """ - for link in all_links: - print("--------------------------------------------------------------") - # doownlaod zip file - print("\nDownloading:", link[40:]) - download(link, download_dir) - print("Downloaded:", link[40:]) - # unzip zip file - print("Unzipping:", link[40:]) - zip_path = os.path.join(download_dir, link[40:]) - unzip(zip_path, download_dir) - print("Unzipped:", link[40:].replace(".zip", "")) - # remove zip file - os.remove(zip_path) + if dataset_name == "midv500": + links_set = { + "midv500": midv500_links, + } + elif dataset_name == "midv2019": + links_set = { + "midv2019": midv2019_links, + } + elif dataset_name == "all": + links_set = { + "midv500": midv500_links, + "midv2019": midv2019_links, + } + else: + raise Exception('Invalid dataset_name, try one of "midv500", "midv2019" or "all".') + + for k, v in links_set.items(): + dst = os.path.join(download_dir, k) + for link in v: + print("--------------------------------------------------------------") + # download zip file + link = link.replace("\\", "/") # for windows + filename = link.split("/")[-1] + print("\nDownloading:", filename) + download(link, dst) + print("Downloaded:", filename) + + # unzip zip file + print("Unzipping:", filename) + zip_path = os.path.join(dst, filename) + unzip(zip_path, dst) + print("Unzipped:", filename.replace(".zip", "")) + + # remove zip file + os.remove(zip_path) if __name__ == "__main__":