-
Notifications
You must be signed in to change notification settings - Fork 23
/
hooks.py
214 lines (181 loc) · 7.16 KB
/
hooks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import re
from urlwatch import filters
from urlwatch import jobs
from urlwatch import reporters
#
# This model suppresses all the dynamic variation in the state sites
# by manipulating the DOM.
#
# The hard states get their own functions but most of the work is generalized.
#
# It tries to make the minimum set of changes required.
#
# public function is:
# content = regularize(data, method, options)
#
from typing import List, Union
from lxml import html, etree
def safe_starts_with(val: Union[str, None], prefix: str) -> bool:
    """Return True when *val* is a string that begins with *prefix*.

    A ``None`` value (for example a missing attribute or an element with no
    text) counts as "no match" instead of raising ``AttributeError``.
    """
    if val is None:
        return False
    return val.startswith(prefix)
def safe_contains(val: Union[str, None], prefix: str) -> bool:
    """Return True when *val* is a string that contains *prefix* anywhere.

    Despite its name, *prefix* is matched as a substring, not only at the
    start of *val*.  A ``None`` value counts as "no match".
    """
    if val is None:
        return False
    return prefix in val
def check_title(elem: html.Element, txt: str) -> bool:
    """Return True when a ``<title>`` element's text contains *txt*.

    The titles are looked up with the XPath ``//title`` evaluated on *elem*.
    A title with no text never matches.
    """
    titles = elem.xpath('//title')
    # Defensive guard kept from the original: treat a None result as no match.
    if titles is None:
        return False
    return any(safe_contains(title.text, txt) for title in titles)
def regularize_if_la(elem: html.Element) -> bool:
    """Special case for Louisiana.

    Returns False, leaving the tree untouched, unless a page title contains
    "Louisiana Department of Health".  Otherwise every node from *elem*
    downwards is scrubbed in place and True is returned.
    """
    if not check_title(elem, "Louisiana Department of Health"):
        return False

    volatile_attrs = ("id", "class", "aria-label")

    # Visit elem itself and then all of its descendants.
    for node in elem.iter():
        attrs = node.attrib

        # These attributes are dropped outright on every node.
        for attr_name in volatile_attrs:
            if attr_name in attrs:
                del attrs[attr_name]

        tag = node.tag
        if tag == "script":
            node.text = "[removed]"
        elif tag == "link":
            attrs["href"] = "[removed]"
            attrs["data-bootloader-hash"] = "[removed]"
        elif tag == "a":
            attrs["href"] = "[removed]"
            if "ajaxify" in attrs:
                del attrs["ajaxify"]
        elif tag == "img":
            attrs["src"] = "[removed]"

    return True
def regularize_if_co_data(elem: html.Element) -> bool:
    """Special case for the Colorado data url.

    Returns False, leaving the tree untouched, unless a page title contains
    "Colorado COVID-19 Fast Facts".  Otherwise *elem* and all of its
    descendants are scrubbed in place and True is returned.
    """
    if not check_title(elem, "Colorado COVID-19 Fast Facts"):
        return False

    def clobber(xelem: html.Element):
        """Scrub one node, then recurse into its children."""
        attrs = xelem.attrib

        # Non-empty id/class values are blanked rather than deleted.
        if attrs.get("id"):
            attrs["id"] = ""
        if attrs.get("class"):
            attrs["class"] = ""

        if xelem.tag == "script" and xelem.text is not None:
            if attrs.get("nonce") is not None:
                attrs["nonce"] = "[removed]"
            # NOTE(review): the source's indentation was lost; this assumes
            # inline script text is replaced whether or not a nonce is
            # present -- confirm against the upstream file.
            xelem.text = "[removed]"
        elif xelem.tag == "style":
            if attrs.get("nonce") is not None:
                attrs["nonce"] = "[removed]"
            elif safe_starts_with(xelem.text, ".lst-kix"):
                xelem.text = "[removed]"
        elif xelem.tag == "img":
            # .get() so an <img> without an alt attribute is skipped instead
            # of raising KeyError and aborting the whole pass.
            if attrs.get("alt") == "Colorado Public Health logo":
                attrs["src"] = "[removed]"
        elif xelem.tag == "a":
            if safe_contains(attrs.get("href"), "urldefense.proofpoint.com"):
                attrs["href"] = "[removed]"

        for child in xelem:
            clobber(child)

    clobber(elem)
    return True
def _drop_children(node):
    """Remove every child of *node*."""
    for child in list(node):
        node.remove(child)


def _scrub_input(node):
    """AZ: mask the value of hidden form inputs."""
    if node.attrib.get("type") == "hidden":
        node.attrib["value"] = "[removed]"


def _scrub_div(node):
    """CA / IL / OH: mask volatile div content or class names."""
    css_class = node.attrib.get("class")
    if node.attrib.get("id") == "DeltaFormDigest":
        # CA
        node.text = "[removed]"
        _drop_children(node)
    elif safe_starts_with(css_class, "view view-tweets"):
        # IL
        node.attrib["class"] = "[removed]"
    elif safe_contains(css_class, " id-"):
        # OH
        node.attrib["class"] = "[removed]"


def _scrub_script(node):
    """Mask script bodies and sources; only the first matching rule applies."""
    body = node.text
    src = node.attrib.get("src")
    # CO
    if safe_starts_with(body, "jQuery.extend(Drupal.setting"):
        node.text = "[removed]"
    elif safe_starts_with(body, "window.NREUM"):
        node.text = "[removed]"
    # OH
    elif (safe_contains(body, "var WASReqURL = ")
          or safe_contains(body, "wpModules.theme.WindowUtils")):
        node.text = "[removed]"
    elif safe_contains(src, "/wps/contenthandler"):
        node.attrib["src"] = "/wps/contenthandler"
    # KY
    elif safe_contains(body, "var formDigestElement = "):
        node.text = "[removed]"
    elif safe_contains(body, "RegisterSod("):
        node.text = "[removed]"
    # MO and NJ
    elif safe_contains(src, "_Incapsula_Resource"):
        node.attrib["src"] = "/_Incapsula_Resource"
    # NE
    elif safe_contains(body, "var g_correlationId = '"):
        node.text = "[removed]"
    # PA
    elif safe_contains(body, "var MSOWebPartPageFormName = 'aspnetForm'"):
        node.text = "[removed]"
    # RI
    elif safe_contains(body, 'window["blob') or safe_contains(body, 'window["bob'):
        node.text = "[removed]"
    # TX
    elif safe_starts_with(node.attrib.get("id"), "EktronScriptBlock"):
        node.attrib["id"] = "EktronScriptBlock"
        node.text = "[removed]"


def _scrub_noscript(node):
    """RI and WA: empty out noscript blocks entirely."""
    node.text = ""
    _drop_children(node)


def _scrub_meta(node):
    """CT: mask the VIcurrentDateTime meta tag."""
    if node.attrib.get("name") == "VIcurrentDateTime":
        node.attrib["content"] = "[removed]"


def _scrub_link(node):
    """OH: mask portal links."""
    if safe_starts_with(node.attrib.get("href"), "/wps/portal/gov"):
        node.attrib["id"] = "[removed]"
        node.attrib["href"] = "[removed]"


def _scrub_anchor(node):
    """OH: mask left-navigation anchors."""
    if node.attrib.get("class") == "left-navigation__link":
        node.attrib["href"] = "[removed]"
        node.text = ""


def _scrub_body(node):
    """KY: reduce the body class to its "brwsr-safari" prefix."""
    if safe_starts_with(node.attrib.get("class"), "brwsr-safari"):
        node.attrib["class"] = "brwsr-safari"


# Tag name -> scrubber for that tag; tags not listed are left untouched.
_TAG_SCRUBBERS = {
    "input": _scrub_input,
    "div": _scrub_div,
    "script": _scrub_script,
    "noscript": _scrub_noscript,
    "meta": _scrub_meta,
    "link": _scrub_link,
    "a": _scrub_anchor,
    "body": _scrub_body,
}


def regularize_other(elem: html.Element):
    """Other cases: scrub *elem* in place, then each child recursively."""
    scrub = _TAG_SCRUBBERS.get(elem.tag)
    if scrub is not None:
        scrub(elem)
    for child in elem:
        regularize_other(child)
def regularize(data, method, options):
    """Regularize html content for cov-19 state sites.

    Args:
        data: the page as ``bytes``, ``str`` or an already-parsed lxml
            element.  An element is modified in place.
        method: accepted for the filter interface; not used here.
        options: accepted for the filter interface; not used here.

    Returns:
        The scrubbed page in the same form it was given: ``bytes`` for
        bytes input, ``str`` for str input, and the element itself for
        element input.

    Raises:
        TypeError: if *data* is none of the supported types.
    """
    if isinstance(data, bytes):
        doc = html.fromstring(data)
    elif isinstance(data, str):
        doc = html.fromstring(data.encode())
    elif etree.iselement(data):
        # html.Element is a factory function, not a class, so comparing
        # type(data) against it can never match a parsed element.
        doc = data
    else:
        raise TypeError(
            f"Invalid input type ({type(data)}), should be str, bytes, or html.Element"
        )

    # The state-specific passes return True once they have handled the page;
    # the generic pass runs only when neither of them applies.
    if not (regularize_if_la(doc) or regularize_if_co_data(doc)):
        regularize_other(doc)

    if isinstance(data, bytes):
        return html.tostring(doc)
    if isinstance(data, str):
        return html.tostring(doc).decode()
    # Element input: hand back the element that was scrubbed in place.
    return doc
class Cov19RegularizeFilter(filters.FilterBase):
    """Remove items that vary with every request for COV19 state reporting sites"""

    __kind__ = 'cov19regularize'

    def filter(self, data, subfilter=None):
        """Run ``regularize`` on *data* and return its result.

        *subfilter* may be:
          * ``None`` -- method defaults to ``'re'`` with no options;
          * a ``dict`` -- its ``'method'`` entry is the method (``KeyError``
            if absent) and the remaining entries are the options;
          * a ``str`` -- used as the method, with no options.

        Raises ``TypeError`` for any other *subfilter* type.
        """
        if subfilter is None:
            method, options = 're', {}
        elif isinstance(subfilter, dict):
            # Copy first so the caller's dict keeps its 'method' entry and
            # the filter can safely be applied more than once.
            options = dict(subfilter)
            method = options.pop('method')
        elif isinstance(subfilter, str):
            method, options = subfilter, {}
        else:
            raise TypeError(
                f"Invalid subfilter type ({type(subfilter)}), should be None, dict, or str"
            )
        return regularize(data, method=method, options=options)