-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawl.py
32 lines (22 loc) · 848 Bytes
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import json
import logging.config
import yaml
import traceback
from crawling.CrawlingHandler import CrawlingHandler
# Configure logging from log.ini. With disable_existing_loggers=False, loggers
# already created by the modules imported above remain enabled.
logging.config.fileConfig(
    fname="log.ini",
    disable_existing_loggers=False,
)

# Logger for this module, named after it.
logger = logging.getLogger(name=__name__)
if __name__ == "__main__":
    # Script entry point: read the API keys and the keyword list from disk,
    # run one crawl over the configured document types and year range, and
    # print the number of results reported by the crawler.

    # Only the "API_Keys" entry of the key file is used below.
    with open(".secrets/keys.json", "r", encoding="utf-8") as f:
        key_data = json.load(f)

    with open("config/keywords_v0.yml", "r", encoding="utf-8") as f:
        keywords = yaml.safe_load(f)

    # NOTE(review): the meaning of the "ar"/"cp" codes is defined by
    # CrawlingHandler.fetch, which is not visible here - confirm there.
    doc_types = ["ar", "cp"]
    year_range = (1990, 2023)  # (both inclusive)

    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logger.info("Starting search with n=%s keywords", len(keywords))

    crawler = CrawlingHandler(key_data["API_Keys"])
    try:
        crawler.fetch(keywords, doc_types, year_range)
        print(crawler.n_results)
    except Exception:
        # Top-level boundary: logger.exception records the message at ERROR
        # level together with the active traceback. The bare `raise` then
        # re-raises the same exception without adding a frame to its
        # traceback, so the process still fails as before.
        logger.exception("Crawl failed")
        raise