"""Retrieves Anthem BlueCross Explanation of Benefits (EOB) statements.
Due to automation countermeasures implemented by Anthem, this module is only
semi-automatic: the user must manually login and navigate to the claims page.
This uses the `selenium` Python package in conjunction with `chromedriver` to
scrape the Anthem website.
Configuration:
==============
The following keys may be specified as part of the configuration dict:
- `login_url`: Required. Must be a `str` that specifies the initial URL at
which to start. The user is responsible for manually logging in and
navigating to the claims page.
- `output_directory`: Required. Must be a `str` that specifies the path on the
local filesystem where the output will be written. If the directory does not
exist, it will be created.
- `profile_dir`: Optional. If specified, must be a `str` that specifies the
path to a persistent Chrome browser profile to use. This should be a path
used solely for this single configuration; it should not refer to your normal
browser profile. If not specified, a fresh temporary profile will be used
each time.
- `headless`: Must be set to `False`, since this scraper requires manual input.
Example:
========
def CONFIG_anthem():
return dict(
module='finance_dl.anthem',
login_url='https://anthem.com',
output_directory=os.path.join(data_dir, 'anthem'),
# profile_dir is optional but recommended.
profile_dir=os.path.join(profile_dir, 'anthem'),
# headless must be `False` since manual intervention is required
headless=False,
)
Output format:
==============
For each claim, two files are written to the specified `output_directory`:
`<id>.json` contains a JSON representation of the claim as returned by the
Anthem server, and `<id>.pdf` contains the PDF "Explanation of Benefits"
statement for the claim.
The JSON file contains output of the form:
{
"patient": {
"displayName": "John Smith",
"uniqueId": "123456789",
"allowsAccess": true
},
"provider": "SOME MEDICAL PROVIDER",
"totalCharges": 385,
"serviceDate": "01/02/2017 00:00:00",
"memberResponsibility": 111.05,
"status": "Approved",
"appliedToDeductible": 111.05,
"claimNumber": "2017123AB1234"
}
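
For example, the saved claims could be loaded for further processing with a
sketch along these lines (illustrative only; `load_claims` is not part of this
module):

    import glob
    import json
    import os

    def load_claims(output_directory):
        # Load every `<id>.json` claim file written by this scraper.
        claims = []
        for path in sorted(glob.glob(os.path.join(output_directory,
                                                  '*.json'))):
            with open(path, encoding='utf-8') as f:
                claims.append(json.load(f))
        return claims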

Interactive shell:
==================

From the interactive shell, type: `self.run()` to start the scraper.
"""
from typing import List, Any
import urllib.parse
import re
import collections
import json
import logging
import datetime
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import bs4
import jsonschema
from atomicwrites import atomic_write
from . import scrape_lib
from . import google_login

logger = logging.getLogger('anthem')

netloc_re = r'^([^\.@]+\.)*anthem\.com$'

class Scraper(scrape_lib.Scraper):
    def __init__(self, login_url: str, output_directory: str, **kwargs):
        super().__init__(use_seleniumrequests=True, **kwargs)
        self.login_url = login_url
        self.output_directory = output_directory

    def login(self):
        # Only navigate to the login page; the user completes the login and
        # navigates to the claims page manually.
        self.driver.get(self.login_url)

    def maybe_get_claims_json(self):
        # The claims page embeds the claim data as JSON in an element with id
        # `claimsJson`.  Any failure to find or parse it is reported as
        # `NoSuchElementException` so that `wait_and_return` keeps polling.
        try:
            soup = bs4.BeautifulSoup(self.driver.page_source, 'html.parser')
            return json.loads(
                soup.find(id='claimsJson').text,
                object_pairs_hook=collections.OrderedDict)
        except Exception:
            raise NoSuchElementException

    def wait_for_claims_json(self):
        # Poll (for up to 500 seconds) until the user has reached the claims
        # page and the embedded JSON is available.
        logger.info('Please login and navigate to the claims page')
        result = self.wait_and_return(self.maybe_get_claims_json,
                                      timeout=500)[0]
        logger.info('Claims data found')
        return result

    def save_documents(self):
        if not os.path.exists(self.output_directory):
            os.makedirs(self.output_directory)
        claims_json = self.wait_for_claims_json()
        downloads_needed = []
        for claim in claims_json['claims']:
            url = claim['eobLinkUrl']
            pdf_path = os.path.join(self.output_directory,
                                    claim['claimNumber'] + '.pdf')
            json_path = os.path.join(self.output_directory,
                                     claim['claimNumber'] + '.json')
            # Write the JSON representation of the claim if not already saved.
            if not os.path.exists(json_path):
                with atomic_write(
                        json_path, mode='w', encoding='utf-8',
                        newline='\n', overwrite=True) as f:
                    f.write(json.dumps(claim, indent=' ').strip() + '\n')
            # Queue the EOB PDF for download if not already saved; skip claims
            # whose EOB link is not an absolute HTTPS URL.
            if not os.path.exists(pdf_path):
                if not url.startswith('https://'): continue
                downloads_needed.append((url, pdf_path))
        for i, (url, pdf_path) in enumerate(downloads_needed):
            logger.info('Downloading EOB %d/%d', i + 1, len(downloads_needed))
            self.driver.get(url)
            download_result, = self.wait_and_return(self.get_downloaded_file)
            with atomic_write(pdf_path, mode='wb', overwrite=True) as f:
                f.write(download_result[1])

    def run(self):
        self.login()
        self.save_documents()


def run(**kwargs):
    scrape_lib.run_with_scraper(Scraper, **kwargs)


def interactive(**kwargs):
    return scrape_lib.interact_with_scraper(Scraper, **kwargs)
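

# Usage sketch (illustrative only; paths are placeholders): this module is
# normally driven via a finance_dl configuration like the `CONFIG_anthem`
# example in the module docstring, but `run` can also be invoked directly
# with the same keyword arguments, e.g.:
#
#     from finance_dl import anthem
#     anthem.run(
#         login_url='https://anthem.com',
#         output_directory='/path/to/data/anthem',
#         headless=False,
#     )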