-
Notifications
You must be signed in to change notification settings - Fork 2
/
marc2post
executable file
·195 lines (170 loc) · 7.31 KB
/
marc2post
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/env python
import json
import re
import sys
import traceback
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from os import listdir, path
from os.path import isdir, isfile, join
from time import gmtime, strftime

import yaml
from pymarc import Field, MARCReader, XMLWriter
from pymarc.marcxml import record_to_xml, record_to_xml_node
from requests_futures import sessions

from modules.config_parser import args
# from multiprocessing.dummy import Pool as ThreadPool
# Load the YAML configuration named on the command line. The file is opened
# in a context manager so the handle is closed as soon as parsing finishes.
with open(args.config) as config_file:
    config = yaml.safe_load(config_file)

# NOTE(review): the "marc2post" section holds target.username and
# target.password (both are read further down when building `auth`), so these
# prints write the credentials to stdout / job logs. Consider redacting them.
print()
print("Config:")
print(config)
print()

# Settings specific to this job live under the "marc2post" key.
jobconfig = config["marc2post"]
print("Job config:")
print(jobconfig)
print()
# Work out the first day to load: yesterday by default, or the day given
# via the `since` argument (YYYY-MM-DD) when it is not the empty string.
today = datetime.now()
yesterday = today - timedelta(days=1)
startdate = (
    datetime.strptime(args.since, "%Y-%m-%d")
    if args.since != ""
    else yesterday
)
# Record when the job started, truncated to whole seconds
# (YYYY-MM-DDTHH:MM:SS).
st = datetime.now()
starttime = st.replace(microsecond=0).isoformat()

# Running tally for the end-of-job summary; every counter starts at zero.
load_report = {
    "job_start": starttime,
    "loaded_records_success": 0,
    "loaded_records_fail": 0,
    "unsuppressed": 0,
    "suppressed": 0,
    "deletes": 0,
}
# Build the list of days to process as YYYY-MM-DD strings, running from
# `startdate` through `yesterday` inclusive.
date_format = "%Y-%m-%d"
yesterday_formatted = yesterday.strftime(date_format)
startdate_formatted = startdate.strftime(date_format)

dates = [startdate_formatted]
nextday = startdate
# Compare calendar dates with `<` rather than looping until the formatted
# strings are equal: with a start date later than yesterday an equality test
# is never satisfied and the loop would append days forever. In that case the
# list now simply contains the start date.
while nextday.date() < yesterday.date():
    nextday += timedelta(days=1)
    dates.append(nextday.strftime(date_format))
print(dates)
# Map each day to the source files that actually exist on disk for it.
# For every configured source, the "%SOURCEDATE%" config value is used as the
# strftime format for the day, and the result replaces the %SOURCEDATE%
# placeholder in that source's file pattern.
days = {}
for day in dates:
    parsed_day = datetime.strptime(day, "%Y-%m-%d")
    found = []
    for source in jobconfig["sources"]:
        stamp = parsed_day.strftime(source["%SOURCEDATE%"])
        filename = source["file_pattern"].replace('%SOURCEDATE%', stamp)
        # Plain concatenation: the directory and pattern are joined exactly
        # as configured, with no separator added.
        candidate = source["source_directory"] + filename
        if path.exists(candidate):
            found.append(candidate)
    days[day] = {"files": found}
# Got here so let's set up some MARC processing info.
def process_ingest_response(f):
    """Tally and log the outcome of one asynchronous ingest POST.

    Waits for the future to finish, then updates the module-level
    ``load_report`` counters: a response carrying a ``Location`` header counts
    as a successful load, anything else as a failure. A request that raises
    (or any error while inspecting the response) is logged and also counted
    as a failure, so every submitted record shows up in exactly one counter.

    Args:
        f: A future returned by ``session.post``; ``f.result()`` is expected
            to yield a response exposing ``status_code``, ``content``,
            ``headers`` and ``request.body``.

    Returns:
        A dict with ``status_code`` and ``output`` (plus ``uri`` when a
        ``Location`` header was present), or ``None`` if an exception was
        raised while obtaining or inspecting the response.
    """
    counted = False  # guards against tallying the same record twice
    try:
        # Resolve the future once; this blocks until the request completes
        # and re-raises any exception the request hit.
        response = f.result()
        r = {
            "status_code": response.status_code,
            "output": response.content,
        }
        if "Location" not in response.headers:
            # Probably have an error of some kind.
            load_report["loaded_records_fail"] += 1
            counted = True
            print("No 'location' header found in response. Possible error: {}".format(response.content))
            print("Source record: {}".format(response.request.body))
        else:
            load_report["loaded_records_success"] += 1
            counted = True
            r["uri"] = response.headers["Location"]
            print("Ingest status for {}: {}".format(r["uri"], r["status_code"]))
        return r
    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still
        # stop the job instead of being logged as a record error.
        if not counted:
            load_report["loaded_records_fail"] += 1
        print('Error processing requests-future result.')
        print('Error processing requests-future result, e_type: {}'.format(type(exc)))
        print('Error processing requests-future result, e_value: {}'.format(str(exc)))
        print('Error processing requests-future result, e_traceback: {}'.format(traceback.format_exc()))
        return None
# Case-insensitive matcher for the literal text "[from old catalog]", which
# is stripped from every serialized record before it is posted.
find_fromoldcatalog = re.compile(re.escape('[from old catalog]'), re.IGNORECASE)

# HTTP settings shared by every ingest request.
auth = (jobconfig["target"]["username"], jobconfig["target"]["password"])
headers = {"Content-type": "application/xml"}
session = sessions.FuturesSession(max_workers=jobconfig["threads"])

for d in days:
    print("Processing date: " + d)
    for f in days[d]["files"]:
        print("Processing file (" + d + "): " + f)

        # The file's category is derived from its path and is the same for
        # every record in it, so work it out once per file.
        # NOTE(review): the test is against the full path, so a directory
        # name containing "delete" or "unsuppressed" would also match.
        is_delete = 'delete' in f
        is_suppressed = 'unsuppressed' not in f
        add_suppress_field = is_suppressed or is_delete

        marcxml_records = []
        with open(f, 'rb') as fh:
            reader = MARCReader(fh)
            for r in reader:
                try:
                    # Add the suppression status to the record
                    if add_suppress_field:
                        r.add_field(
                            Field(
                                tag='993',
                                indicators=[' ', ' '],
                                subfields=[
                                    'a', 'opacsuppress',
                                    'd', datetime.today().strftime('%Y-%m-%d'),
                                ],
                            )
                        )
                    node = record_to_xml_node(r, False, True)
                    # Serialize with encoding="unicode"; otherwise the default
                    # output encodes special characters as numeric character
                    # references (the &#NNNN; convention).
                    # https://stackoverflow.com/questions/15304229/convert-python-elementtree-to-string
                    record_xml = ET.tostring(node, encoding="unicode")
                    record_xml = find_fromoldcatalog.sub('', record_xml)
                    marcxml_records.append(record_xml)
                except Exception as exc:
                    # `Exception` rather than a bare except, so Ctrl-C /
                    # SystemExit abort the job instead of being counted as a
                    # bad record.
                    print('Error generating MARC/XML.')
                    # The guard suggests the reader can hand back None for a
                    # record it could not parse -- TODO confirm against the
                    # installed pymarc version.
                    if r is not None:
                        print('Error record 001: ' + str(r['001']))
                    load_report["loaded_records_fail"] += 1
                    print('Error generating MARC/XML, e_type: {}'.format(type(exc)))
                    print('Error generating MARC/XML, e_value: {}'.format(str(exc)))
                    print('Error generating MARC/XML, e_traceback: {}'.format(traceback.format_exc()))

        # 2024-06-18: `suppression` was moved into the MARC record above, removing the need to pass it as a URL parameter
        # -Franz Osorio
        if is_delete:
            load_report["deletes"] += len(marcxml_records)
        elif is_suppressed:
            load_report["suppressed"] += len(marcxml_records)
        else:
            load_report["unsuppressed"] += len(marcxml_records)

        # Queue one POST per converted record, then handle the responses in
        # submission order.
        futures = [
            session.post(
                jobconfig["target"]["endpoint"],
                auth=auth,
                headers=headers,
                data=record_xml.encode('utf-8'),
            )
            for record_xml in marcxml_records
        ]
        results = [
            process_ingest_response(future)
            for future in futures
        ]
# Close out the report and print the end-of-job summary.
et = datetime.now()
endtime = et.isoformat()[:19]
# Named `elapsed` so the result does not rebind the `timedelta` class
# imported from datetime.
elapsed = et - st
load_report["job_end"] = endtime
# NOTE(review): the status is recorded as "success" regardless of the value
# of loaded_records_fail -- confirm that is intended.
load_report["status"] = "success"

summary_lines = [
    'Job started at: {}'.format(starttime),
    'Job ended at: {}'.format(endtime),
    'Elapsed time: {}'.format(elapsed),
    'Start date of loaded records: {}'.format(startdate_formatted),
    'Number of days loaded: {}'.format(len(dates)),
    'Number of records loaded successfully: {}'.format(load_report["loaded_records_success"]),
    'Number of records load failures: {}'.format(load_report["loaded_records_fail"]),
    'Number of unsuppressed records: {}'.format(load_report["unsuppressed"]),
    'Number of suppressed records: {}'.format(load_report["suppressed"]),
    'Number of deleted records: {}'.format(load_report["deletes"]),
]

print('************')
print('************')
for line in summary_lines:
    print(line)
print('************')
print('************')
print()
print()