Skip to content
This repository has been archived by the owner on Dec 2, 2022. It is now read-only.

Commit

Permalink
add support for midv-2019 data
Browse files Browse the repository at this point in the history
  • Loading branch information
fcakyon committed Aug 19, 2020
1 parent 6f526b1 commit 3f766d0
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 31 deletions.
49 changes: 37 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,21 @@
[![PyPI version](https://badge.fury.io/py/midv500.svg)](https://badge.fury.io/py/midv500)
![CI](https://github.com/fcakyon/midv500/workflows/CI/badge.svg)

## Download and convert MIDV-500 dataset into COCO instance segmentation format
Automatically download/unzip MIDV-500 dataset and convert the annotations into COCO instance segmentation format.
## Download and convert MIDV-500 datasets into COCO instance segmentation format
Automatically download/unzip [MIDV-500](https://arxiv.org/abs/1807.05786) and [MIDV-2019](https://arxiv.org/abs/1910.04009) datasets and convert the annotations into COCO instance segmentation format.

Then, dataset can be directly used in the training of Yolact, Detectron type of models.

## MIDV-500 Dataset
MIDV-500 consists of 500 video clips for 50 different identity document types with ground truth which allows to perform research in a wide scope of various document analysis problems.
## MIDV-500 Datasets
MIDV-500 consists of 500 video clips for 50 different identity document types, including 17 ID cards, 14 passports, 13 driving licences and 6 other identity documents of different countries, with ground truth that enables research on a wide range of document analysis problems. Additionally, the MIDV-2019 dataset contains distorted and low-light images.

<img width="1000" alt="teaser" src="./figures/midv500.png">

You can find more detail on: [MIDV-500: A Dataset for Identity Documents Analysis and Recognition on Mobile Devices in Video Stream](https://arxiv.org/abs/1807.05786)
You can find more details in the papers:

[MIDV-500: A Dataset for Identity Documents Analysis and Recognition on Mobile Devices in Video Stream](https://arxiv.org/abs/1807.05786)

[MIDV-2019: Challenges of the modern mobile-based document OCR](https://arxiv.org/abs/1910.04009)


## Getting started
Expand All @@ -22,20 +26,41 @@ pip install midv500
```

### Usage

- Import package:

```python
# import package
import midv500
```

- Download and unzip desired version of the dataset:

```python
# set directory for dataset to be downloaded
dataset_dir = 'data/midv500/'
dataset_dir = 'midv500_data/'

# download and unzip midv500 dataset
midv500.download_dataset(dataset_dir)
# download and unzip the base midv500 dataset
dataset_name = "midv500"
midv500.download_dataset(dataset_dir, dataset_name)

# or download and unzip the midv2019 dataset that includes low light images
dataset_name = "midv2019"
midv500.download_dataset(dataset_dir, dataset_name)

# or download and unzip both midv500 and midv2019 datasets
dataset_name = "all"
midv500.download_dataset(dataset_dir, dataset_name)
```

- Convert downloaded dataset to coco format:

```python
# set directory for coco annotations to be saved
export_dir = 'data/midv500/'
export_dir = 'midv500_data/'

# set the desired name of the coco file, coco file will be exported as "filename + '_coco.json'"
filename = 'midv500'

# convert midv500 annotations to coco format
midv500.convert_to_coco(dataset_dir, export_dir)
midv500.convert_to_coco(dataset_dir, export_dir, filename)
```

2 changes: 1 addition & 1 deletion midv500/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import absolute_import

__version__ = "0.1.3"
__version__ = "0.2.0"

from midv500.convert_dataset import convert as convert_to_coco

Expand Down
4 changes: 2 additions & 2 deletions midv500/convert_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
)


def convert(root_dir: str, export_dir: str):
def convert(root_dir: str, export_dir: str, filename: str):
"""
    Walks inside root_dir (should only contain original midv500 dataset folders),
reads all annotations, and creates coco styled annotation file
Expand Down Expand Up @@ -114,7 +114,7 @@ def convert(root_dir: str, export_dir: str):
coco_dict["categories"] = [{"name": "id_card", "id": 1}]

# export coco dict
export_path = os.path.join(export_dir, "midv500_coco.json")
export_path = os.path.join(export_dir, filename + "_coco.json")
with open(export_path, "w") as f:
json.dump(coco_dict, f)

Expand Down
112 changes: 96 additions & 16 deletions midv500/download_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import argparse
from midv500.utils import download, unzip

all_links = [
midv500_links = [
"ftp://smartengines.com/midv-500/dataset/01_alb_id.zip",
"ftp://smartengines.com/midv-500/dataset/02_aut_drvlic_new.zip",
"ftp://smartengines.com/midv-500/dataset/03_aut_id_old.zip",
Expand Down Expand Up @@ -56,24 +56,104 @@
]


def download_dataset(download_dir: str):
# FTP download links for the 50 MIDV-2019 document classes
# (extra distorted / low-light captures for the MIDV-500 classes).
# See: https://arxiv.org/abs/1910.04009
midv2019_links = [
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/01_alb_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/02_aut_drvlic_new.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/03_aut_id_old.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/04_aut_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/05_aze_passport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/06_bra_passport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/07_chl_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/08_chn_homereturn.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/09_chn_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/10_cze_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/11_cze_passport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/12_deu_drvlic_new.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/13_deu_drvlic_old.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/14_deu_id_new.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/15_deu_id_old.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/16_deu_passport_new.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/17_deu_passport_old.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/18_dza_passport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/19_esp_drvlic.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/20_esp_id_new.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/21_esp_id_old.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/22_est_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/23_fin_drvlic.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/24_fin_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/25_grc_passport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/26_hrv_drvlic.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/27_hrv_passport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/28_hun_passport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/29_irn_drvlic.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/30_ita_drvlic.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/31_jpn_drvlic.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/32_lva_passport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/33_mac_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/34_mda_passport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/35_nor_drvlic.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/36_pol_drvlic.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/37_prt_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/38_rou_drvlic.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/39_rus_internalpassport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/40_srb_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/41_srb_passport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/42_svk_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/43_tur_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/44_ukr_id.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/45_ukr_passport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/46_ury_passport.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/47_usa_bordercrossing.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/48_usa_passportcard.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/49_usa_ssn82.zip",
    "ftp://smartengines.com/midv-500/extra/midv-2019/dataset/50_xpo_id.zip",
]


def download_dataset(download_dir: str, dataset_name: str = "midv500"):
    """
    Downloads and unzips the requested MIDV dataset(s).

    Each dataset is placed under its own sub-directory of download_dir
    (e.g. download_dir/midv500/...). Zip archives are removed after
    extraction to save disk space.

    Args:
        download_dir: Root directory the archives are downloaded into
            and extracted under.
        dataset_name: Which dataset(s) to fetch:
            "midv500": https://arxiv.org/abs/1807.05786
            "midv2019": https://arxiv.org/abs/1910.04009
            "all": midv500 + midv2019

    Raises:
        ValueError: If dataset_name is not "midv500", "midv2019" or "all".
    """
    if dataset_name == "midv500":
        links_set = {
            "midv500": midv500_links,
        }
    elif dataset_name == "midv2019":
        links_set = {
            "midv2019": midv2019_links,
        }
    elif dataset_name == "all":
        links_set = {
            "midv500": midv500_links,
            "midv2019": midv2019_links,
        }
    else:
        # Bug fix: the original code constructed this Exception without
        # raising it, which then caused a NameError on links_set below.
        raise ValueError(
            'Invalid dataset_name, try one of "midv500", "midv2019" or "all".'
        )

    for name, links in links_set.items():
        # each dataset gets its own sub-directory under download_dir
        dst = os.path.join(download_dir, name)
        for link in links:
            print("--------------------------------------------------------------")
            # normalize separators so the filename split below also works
            # for windows-style paths
            link = link.replace("\\", "/")
            filename = link.split("/")[-1]

            # download zip file
            print("\nDownloading:", filename)
            download(link, dst)
            print("Downloaded:", filename)

            # unzip zip file
            print("Unzipping:", filename)
            zip_path = os.path.join(dst, filename)
            unzip(zip_path, dst)
            print("Unzipped:", filename.replace(".zip", ""))

            # remove zip file to save disk space
            os.remove(zip_path)


if __name__ == "__main__":
Expand Down

0 comments on commit 3f766d0

Please sign in to comment.