Skip to content

Commit

Permalink
gh-7: adapted to new version of Weaviate
Browse files Browse the repository at this point in the history
  • Loading branch information
michaverhagen committed Jun 15, 2021
1 parent cf338b0 commit 498c3db
Show file tree
Hide file tree
Showing 8 changed files with 10,053 additions and 55 deletions.
31 changes: 2 additions & 29 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ notifications:
os: linux
language: python
python:
- "3.7"
- "3.8"

services:
Expand All @@ -15,33 +14,7 @@ install:
- ci/start_weaviate.sh

script:
- python -m unittest test.create_schema_test
- python -m unittest test.import_data_test
- ./loaddata.py

after_script:
- docker-compose down

# jobs:
# include:
# - stage: "Unit tests"
# name: "3.7"
# branches:
# only:
# - master
# python: "3.7"
# install:
# - pip install -r requirements.txt
# script:
# - python -m unittest test.create_schema_test
# - python -m unittest test.import_data_test

# - name: "3.8"
# branches:
# only:
# - master
# python: "3.8"
# install:
# - pip install -r requirements.txt
# script:
# - python -m unittest test.create_schema_test
# - python -m unittest test.import_data_test
- docker-compose -f ./docker/docker-compose.yml down
4 changes: 2 additions & 2 deletions ci/start_weaviate.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

echo "Run Docker compose"
nohup docker-compose up &
nohup docker-compose -f ./docker/docker-compose.yml up &

echo "Wait until weaviate is up"

Expand All @@ -20,4 +20,4 @@ while [ $? -ne 0 ]; do
fi
curl localhost:8080/v1/meta
done
echo "Weaviate is up and running"
echo "Weaviate is up and running"
8 changes: 6 additions & 2 deletions config.yml
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
weaviate:
url: 'http://localhost:8080'
schema: './schema/schema.json'
#wcs: 'demo-arxiv'
username: "WEAVIATE_USERNAME"
password: "WEAVIATE_PASSWORD"
debug: False
verbose: True
max_batch_size: 1000
max_batch_size: 50
overwrite_schema: true


data:
taxanomy: "./data/taxanomy/taxanomy.html"
#metadata_file: "data/metadata/arxiv-metadata-oai-snapshot.json"
metadata_dir: "./data/metadata/"

#metadata_file: "./data/metadata/arxiv-metadata-oai-snapshot.json"
#metadata_file: "https://storage.googleapis.com/semi-technologies-public-data/arxiv-metadata-small.json.zip"
metadata_file: "./data/metadata/10.json"
#metadata_file: "./data/metadata/100.json"
#metadata_file: "./data/metadata/1000.json"
Expand Down
10,000 changes: 10,000 additions & 0 deletions data/metadata/arxiv-metadata-small.json

Large diffs are not rendered by default.

Binary file added data/metadata/arxiv-metadata-small.json.zip
Binary file not shown.
Empty file removed data/metadata/test_data.json
Empty file.
64 changes: 42 additions & 22 deletions modules/metadata.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,20 @@
""" This module contains general utility functions """

from os import path
import zipfile
import json
import requests


def get_metadata(config: dict) -> list:
def _read_metadata_file(filename: str, max_size: int, skip_n_papers: int) -> list:
""" converts and returns the arxiv data set from json to a list
:param filename: the json file location and name
:type filename: str
:param max_size: the maximum number of papers to import, defaults to 1000000000
:type max_size: int, optional
:param skip_n_papers: the number of papers to skip, defaults to 0
:type skip_n_papers: int, optional
:return: a list of paper objects with metainfo
:rtype: list
"""
data = []
ids = set()
i = 0

if config is None or 'data' not in config or 'metadata_file' not in config['data']:
return None

path = config['data']['metadata_file']
if 'n_papers' in config['data']:
max_size = config['data']['n_papers']
skip_n_papers = 0
if 'skip_n_papers' in config['data']:
skip_n_papers = config['data']['skip_n_papers']

print("Start loading ArXiv dataset -----------:", path)
with open(path, 'r') as file:
print("Start loading ArXiv dataset -----------:", filename)
with open(filename, 'r') as file:
for line in file:
if max_size > 0:
if len(data) >= max_size:
Expand All @@ -46,3 +30,39 @@ def get_metadata(config: dict) -> list:
print('Done loading ArXiv dataset ------------: load {}, skip {}'.format(len(data),i-len(data)))

return data[:max_size]


def get_metadata(config: dict) -> list:
    """ loads the arxiv metadata set described by the configuration

    The value of config['data']['metadata_file'] is either the path of an
    existing local file or an http(s) url. A url is downloaded into
    config['data']['metadata_dir']; when the download is a zip archive it is
    extracted there and the first member of the archive is read.

    :param config: the configuration; under 'data' it reads 'metadata_file'
        (required), 'n_papers' and 'skip_n_papers' (optional) and
        'metadata_dir' (required only when 'metadata_file' is a url)
    :type config: dict
    :return: a list of paper objects with metainfo, or None when no metadata
        file is configured
    :rtype: list
    :raises FileNotFoundError: if the location is neither an existing file
        nor an http(s) url
    :raises ValueError: if a downloaded zip archive contains no files
    """
    if config is None or 'data' not in config or 'metadata_file' not in config['data']:
        return None

    settings = config['data']
    # NOTE(review): -1 is passed on as "no limit", but the reader ends with
    # data[:max_size], which for -1 would drop the last paper -- confirm.
    max_size = settings.get('n_papers', -1)
    skip = settings.get('skip_n_papers', 0)
    location = settings['metadata_file']

    if path.exists(location):
        filename = location
    elif location.startswith(('http://', 'https://')):
        filename = _download_metadata(location, settings['metadata_dir'])
    else:
        # Previously this case fell through with `filename` unbound.
        raise FileNotFoundError(
            "metadata file not found: {}".format(location))

    return _read_metadata_file(filename, max_size, skip)


def _download_metadata(url: str, target_dir: str) -> str:
    """ downloads url into target_dir and returns the path of the file to read

    :param url: the http(s) location of the data set (plain json or zip)
    :type url: str
    :param target_dir: the directory to store (and extract) the download in
    :type target_dir: str
    :return: the path of the downloaded file, or of the first extracted
        member when the download is a zip archive
    :rtype: str
    """
    print("Downloading ArXiv dataset from --------:", url)
    response = requests.get(url)
    # Fail here rather than store an HTTP error page as if it were data.
    response.raise_for_status()

    download = path.join(target_dir, url.split('/')[-1])
    with open(download, 'wb') as file:
        file.write(response.content)

    if not download.endswith('.zip'):
        return download

    with zipfile.ZipFile(download, 'r') as zip_ref:
        members = zip_ref.namelist()
        if not members:
            raise ValueError(
                "downloaded archive is empty: {}".format(download))
        zip_ref.extractall(target_dir)
    return path.join(target_dir, members[0])
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ pandas==1.1.2
beautifulsoup4==4.9.3
python_dateutil==2.8.1
PyYAML==5.4.1
google-cloud-storage==1.38.0

0 comments on commit 498c3db

Please sign in to comment.