gh-7: adapted to new version of Weaviate

weaviate · Jun 15, 2021 · 498c3db · 498c3db
1 parent cf338b0
commit 498c3db
Show file tree

Hide file tree

Showing 8 changed files with 10,053 additions and 55 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -4,7 +4,6 @@ notifications:
 os: linux
 language: python
 python:
- - "3.7"
  - "3.8"
 
 services:
@@ -15,33 +14,7 @@ install:
  - ci/start_weaviate.sh
 
 script: 
- - python -m unittest test.create_schema_test
- - python -m unittest test.import_data_test
+ - ./loaddata.py
 
 after_script:
- - docker-compose down
-
-# jobs:
-# include:
-# - stage: "Unit tests"
-# name: "3.7"
-# branches:
-# only:
-# - master
-# python: "3.7"
-# install:
-# - pip install -r requirements.txt
-# script:
-# - python -m unittest test.create_schema_test
-# - python -m unittest test.import_data_test
-
-# - name: "3.8"
-# branches:
-# only:
-# - master
-# python: "3.8"
-# install:
-# - pip install -r requirements.txt
-# script:
-# - python -m unittest test.create_schema_test
-# - python -m unittest test.import_data_test
+ - docker-compose -f ./docker/docker-compose.yml down
diff --git a/ci/start_weaviate.sh b/ci/start_weaviate.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 echo "Run Docker compose"
-nohup docker-compose up &
+nohup docker-compose -f ./docker/docker-compose.yml up &
 
 echo "Wait until weaviate is up"
 
@@ -20,4 +20,4 @@ while [ $? -ne 0 ]; do
  fi
  curl localhost:8080/v1/meta
 done
-echo "Weaviate is up and running"
+echo "Weaviate is up and running"
diff --git a/config.yml b/config.yml
@@ -1,17 +1,21 @@
 weaviate:
  url: 'http://localhost:8080'
  schema: './schema/schema.json'
+ #wcs: 'demo-arxiv'
  username: "WEAVIATE_USERNAME"
  password: "WEAVIATE_PASSWORD"
  debug: False
  verbose: True
- max_batch_size: 1000
+ max_batch_size: 50
  overwrite_schema: true
 
 
 data:
  taxanomy: "./data/taxanomy/taxanomy.html"
- #metadata_file: "data/metadata/arxiv-metadata-oai-snapshot.json"
+ metadata_dir: "./data/metadata/"
+
+ #metadata_file: "./data/metadata/arxiv-metadata-oai-snapshot.json"
+ #metadata_file: "https://storage.googleapis.com/semi-technologies-public-data/arxiv-metadata-small.json.zip"
  metadata_file: "./data/metadata/10.json"
  #metadata_file: "./data/metadata/100.json"
  #metadata_file: "./data/metadata/1000.json"

diff --git a/data/metadata/arxiv-metadata-small.json b/data/metadata/arxiv-metadata-small.json
diff --git a/data/metadata/arxiv-metadata-small.json.zip b/data/metadata/arxiv-metadata-small.json.zip
diff --git a/data/metadata/test_data.json b/data/metadata/test_data.json
diff --git a/modules/metadata.py b/modules/metadata.py
@@ -1,36 +1,20 @@
 """ This modules contains general utility functions """
 
+from os import path
+import zipfile
 import json
+import requests
 
 
-def get_metadata(config: dict) -> list:
+def _read_metadata_file(filename: str, max_size: int, skip_n_papers: int) -> list:
  """ converts and returns the arxiv data set from json to a list
-
- :param datafile: the json file location and name
- :type datafile: str
- :param max_size: the maximum number of papers to import, defaults to 1000000000
- :type max_size: int, optional
- :param skip_n_papers: the number of papers to skip, defaults to 0
- :type skip_n_papers: int, optional
- :return: a list of paper objects with metainfo
- :rtype: list
  """
  data = []
  ids = set()
  i = 0
 
- if config is None or 'data' not in config or 'metadata_file' not in config['data']:
- return None
-
- path = config['data']['metadata_file']
- if 'n_papers' in config['data']:
- max_size = config['data']['n_papers']
- skip_n_papers = 0
- if 'skip_n_papers' in config['data']:
- skip_n_papers = config['data']['skip_n_papers']
-
- print("Start loading ArXiv dataset -----------:", path)
- with open(path, 'r') as file:
+ print("Start loading ArXiv dataset -----------:", filename)
+ with open(filename, 'r') as file:
  for line in file:
  if max_size > 0:
  if len(data) >= max_size:
@@ -46,3 +30,39 @@ def get_metadata(config: dict) -> list:
  print('Done loading ArXiv dataset ------------: load {}, skip {}'.format(len(data),i-len(data)))
 
  return data[:max_size]
+
+
+def get_metadata(config: dict) -> list:
+    """ Load the ArXiv metadata named by config['data']['metadata_file'].
+
+    The location is an existing local path or an http(s) URL. A download is
+    saved in config['data']['metadata_dir']; a .zip is unpacked there and its
+    first member is read. Optional 'n_papers' / 'skip_n_papers' are passed on
+    to the reader. Returns None when the setting is missing and raises
+    FileNotFoundError when the location cannot be resolved to a file.
+    """
+    if config is None or 'data' not in config or 'metadata_file' not in config['data']:
+        return None
+    settings = config['data']
+    max_size = settings.get('n_papers', -1)
+    skip = settings.get('skip_n_papers', 0)
+
+    location = settings['metadata_file']
+    filename = None
+    if path.exists(location):
+        filename = location
+    elif location.startswith(('http://', 'https://')):
+        print("Downloading ArXiv dataset from --------:", location)
+        response = requests.get(location, timeout=60)
+        response.raise_for_status()  # never store an HTTP error page as data
+        filename = path.join(settings['metadata_dir'], location.split('/')[-1])
+        with open(filename, 'wb') as target:  # closes the handle promptly
+            target.write(response.content)
+        if filename.endswith('.zip'):
+            with zipfile.ZipFile(filename, 'r') as zip_ref:
+                members = zip_ref.namelist()
+                zip_ref.extractall(settings['metadata_dir'])
+            filename = path.join(settings['metadata_dir'], members[0]) if members else None
+    if filename is None:  # neither an existing path nor a usable download
+        raise FileNotFoundError("cannot resolve metadata_file: {}".format(location))
+    return _read_metadata_file(filename, max_size, skip)
diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,4 @@ pandas==1.1.2
 beautifulsoup4==4.9.3
 python_dateutil==2.8.1
 PyYAML==5.4.1
+google-cloud-storage==1.38.0