-
Notifications
You must be signed in to change notification settings - Fork 0
/
scan_routes.py
354 lines (323 loc) · 17.5 KB
/
scan_routes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
"""
@File:scan_routes.py
@Time:2024/8/29
@Auth:Tr0e
@Github:https://github.com/Tr0e
@Description:基于正则表达式,批量自动化解析多项目源代码的路由信息,生成可视化表格,辅助代码审计统计
@版本迭代:
v1.0.0 20240829,首版本开发完成;
v1.1.0 20240830,修复父级路由识别的已知Bug;
v1.2.0 20240831,添加对多个maven项目同时扫描的功能;
v2.0.0 20240906,支持通过参数指定将所有路由集中在一个sheet,新增Project、Annotation(注解)两列;
"""
import os
import re
import argparse
import pandas as pd
from colorama import Fore, init
from openpyxl import load_workbook
# 配置colorama颜色自动重置,否则得手动设置Style.RESET_ALL
init(autoreset=True)
# 统计路由数量的全局变量
route_num = 1
# 正则表达式来匹配Spring的路由注解、方法返回类型、方法名称和参数
mapping_pattern = re.compile(r'@(Path|(Request|Get|Post|Put|Delete|Patch)Mapping)\(')
def extract_request_mapping_value(s):
"""
提取类开头的父级路由,通过@RequestMapping注解中的value字段的值,
可能出现括号中携带除了value/path之外的字段,比如 method = RequestMethod.POST。
也能处理@RequestMapping("/clientSideFiltering/challenge-store")这种格式。
"""
# 匹配三种情况:带有 value、path 或者直接给出的路径
pattern = r'@RequestMapping\(\s*(?:(?:value|path)\s*=\s*"([^"]*)"|"(\/[^"]*)")(?:,.*?)?\)'
match = re.search(pattern, s)
if match:
# 返回匹配到的 'value' 或 'path' 字段,或者直接匹配到的路径
return match.group(1) or match.group(2)
else:
return None
def get_class_parent_route(content):
"""
提取类级别的父级路由
注意有可能会返回None,比如java-sec-code-master里的CommandInject.java
"""
parent_route = None
content_lines = content.split('\n')
public_class_line = None
# 遍历每一行,找到 "public class" 所在的行
for line_number, line in enumerate(content_lines, start=1):
if re.search(r'public class', line):
public_class_line = line_number
break
if public_class_line is not None:
# 提取 "public class" 之前的行
content_before_public_class = content_lines[:public_class_line]
for line in content_before_public_class:
if re.search(r'@RequestMapping\(', line):
parent_route = extract_request_mapping_value(line)
return parent_route, public_class_line
def extract_value_between_quotes(line):
"""
提取字符串中第一个""中间的值,目的是提取@GetMapping("/upload")格式中的路由值(尚待解决的是部分项目的路由值是通过一个常量类集中定义的)
"""
pattern = r'"(.*?)"'
match = re.search(pattern, line)
if match:
value = match.group(1)
return value
else:
return None
def extract_function_details(function_def):
"""
从函数定义的行级代码,解析并返回一个函数的详细信息,包括返回类型、函数名、参数等
"""
# print(function_def)
pattern = re.compile(
# r'public\s+(?:@\w+\s+)*([\w<>\[\],\s]+)\s+(\w+)\s*\((.*)\)'
r'public\s+(?:@\w+(?:\([^)]*\))?\s+)*([\w<>\[\],\s\?]+)\s+(\w+)\s*\((.*)\)\s*(?:throws\s+[\w<>,\s]+)?\s*{?'
)
# 匹配函数签名
match = pattern.search(function_def)
if match:
return_type = match.group(1) # 返回类型
function_name = match.group(2) # 函数名
parameters = match.group(3) # 参数
return return_type, function_name, parameters
else:
return None, None, None
def find_constant_value(folder_path, constant_name):
"""
提取出路由的常量值
"""
for root, dirs, files in os.walk(folder_path):
for file in files:
if file.endswith('.java'):
file_path = os.path.join(root, file)
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
pattern = fr'static\s+final\s+String\s+{constant_name}\s*=\s*"(.*?)";'
match = re.search(pattern, content)
if match:
return match.group(1)
return None
def get_path_value(line, directory):
"""
提取出路由的值,适配通过字符串直接提供的路由值,或者通过常量提供的路由值,比如:
@GetMapping(path = "/server-directory")、@GetMapping(path = URL_HINTS_MVC, produces = "application/json")、@GetMapping(value = "/server-directory")
"""
pattern = r'\((?:path|value)\s*=\s*(?:"([^"]*)"|([A-Z_]+))'
matches = re.findall(pattern, line)
route = ''
for match in matches:
if match[0]: # 提取出path为字符串的值
route = match[0]
# print(Fore.GREEN + route)
elif match[1]: # 提取出path为常量的值
route = find_constant_value(directory, match[1])
# print(Fore.BLUE + route)
return route
def get_request_type(route_define):
"""
从路由定义的注解中,提取出API请求类型,比如GET、POST等
"""
# print(route_define)
if route_define.startswith('@RequestMapping'):
# 提取@RequestMapping注解中的method字段的值
if route_define.find('method =') > -1:
request_type = (str(route_define.split('method =')[1]).split('}')[0].strip().replace('{', '').replace(')', '')).replace('RequestMethod.', '')
# 未指定具体请求类型的RequestMapping注解,则默认为支持所有请求类型
else:
request_type = 'All'
else:
request_type = route_define.split('Mapping')[0][1:]
return request_type
def extract_context_path(directory):
"""
从application.properties或xxx.yml等Java项目配置文件中提取上下文路径
"""
for dirPath, dirNames, fileNames in os.walk(directory):
for filename in fileNames:
if filename.endswith(".properties") or filename.endswith('.yml') or filename.endswith('.yaml'):
file_path = os.path.join(dirPath, filename)
with open(file_path, 'r', encoding='utf-8') as data:
data = data.readlines()
for line in data:
# 匹配 properties 文件
if line.startswith('server.servlet.context-path'):
context = line.split('=')[1].strip()
print(Fore.BLUE + "[*]Found context-path:" + context)
return context
# 匹配 yml 文件
elif line.find('context-path') > -1:
context = line.strip().split(':')[1].strip()
print(Fore.BLUE + "[*]Found context-path:" + context)
return context
else:
continue
return None
def extract_routes_from_file(file_path, directory, folder, context):
routes = []
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
# 找到Controller注解对应的Controller类
if re.search('@(?!(ControllerAdvice))(|Rest)Controller', content):
parent_route, public_class_line = get_class_parent_route(content)
content_lines = content.split('\n')
# 提取类名定义所在行后的所有代码
content_after_public_class = content_lines[public_class_line:]
global route_num
for i, line in enumerate(content_after_public_class):
try:
if re.search(mapping_pattern, line):
route_define = line.strip()
# 如果路由映射的定义逻辑在一行代码中完全覆盖
if route_define.endswith(')'):
route_define = route_define
# 如果路由映射的定义逻辑在多行代码中才覆盖
else:
q = i + 1
while q < len(content_after_public_class) and not content_after_public_class[q].strip().endswith(')'):
route_define += '' + content_after_public_class[q].strip()
q += 1
route_define += '' + content_after_public_class[q].strip()
# print(Fore.RED + route_define)
# 判断下路由信息是通过字符串字节提供的,还是通过常量提供的,然后统一提取出字符串值
if re.search(r'\("', route_define):
route = extract_value_between_quotes(route_define)
else:
route = get_path_value(route_define, directory)
# 获取完整的一条路由信息
if parent_route is not None and route is not None:
route = parent_route + route
# 向下遍历找到函数的定义的首行代码位置,此处考虑了路由注解下方可能还携带多个其它用途的注解
method_start_line = i + 1
while method_start_line < len(content_after_public_class) and not content_after_public_class[method_start_line].strip().startswith('public'):
method_start_line += 1
method_define = content_after_public_class[method_start_line].strip()
# 获取函数定义的行级代码,考虑函数定义可能跨越多行,需进行代码合并,获得完整的函数定义,否则可能导致函数参数提取残缺
method_end_line = method_start_line
while method_start_line < len(content_after_public_class) and not content_after_public_class[method_end_line].strip().endswith('{'):
method_end_line += 1
method_define = method_define + '' + content_after_public_class[method_end_line].strip()
# print(route)
# print(method_define)
# 解析函数定义的返回值、函数名、函数参数
return_type, function_name, parameters = extract_function_details(method_define)
# 获得函数对应的所有注解信息,查找函数定义所在行上方的第一个空行之前的所有数据
annotation = []
for j in range(method_start_line - 1, -1, -1): # 从当前行往上查找
if content_after_public_class[j].strip() == '': # 空行
break
annotation.append(content_after_public_class[j].strip()) # 添加当前行到结果列表
annotation.reverse()
# print(Fore.RED + str(annotation))
route_info = {
'project': folder,
'context': context,
'file': str(file_path.split('\\')[-1]).split('.java')[0],
'parent_route': parent_route,
'route': route,
'request': get_request_type(route_define),
'return_type': return_type,
'method_name': function_name,
'parameters': parameters,
'annotation': str(annotation),
'file_path': file_path,
}
routes.append(route_info)
print(Fore.GREEN + '[%s]' % str(route_num) + str(route_info))
route_num += 1
except Exception as e:
print(Fore.RED + '[-]' + str(file) + ' ' + str(e))
continue
return routes
def find_all_pom_files(directory):
"""
遍历寻找指定目录下的所有 pom.xml 文件,并建立一个字典,存储文件夹名称和其绝对路径。
"""
pom_dict = {} # 初始化字典存储结果
# 遍历目录及其所有子目录
for dirPath, dirNames, fileNames in os.walk(directory):
# 检查当前目录中的是否包含pom.xml文件和src子文件夹,需同时满足才认为当前是个maven项目根路径
if 'pom.xml' in fileNames and 'src' in dirNames:
# 获取目录名称
folder_name = os.path.basename(dirPath)
# 将目录名称和绝对路径添加到字典中
pom_dict[folder_name] = os.path.abspath(dirPath)
return pom_dict
def write_routes_to_xlsx(all_data_list, folder_name, sheet_rule):
"""
将路由信息写入Excel文件
"""
dataSource = {
"Project": [item['project'] for item in all_data_list],
"Context": [item['context'] for item in all_data_list],
"File": [item['file'] for item in all_data_list],
"Parent Route": [item['parent_route'] for item in all_data_list],
"Route": [item['route'] for item in all_data_list],
"Request": [item['request'] for item in all_data_list],
"Method Name": [item['method_name'] for item in all_data_list],
"Return Type": [item['return_type'] for item in all_data_list],
"Parameters": [item['parameters'] for item in all_data_list],
"Annotation": [item['annotation'] for item in all_data_list],
"File Path": [item['file_path'] for item in all_data_list],
}
if not os.path.exists('Data.xlsx'):
# 本地xlsx文件不存在,创建一个全新的工作簿
with pd.ExcelWriter('Data.xlsx', engine='openpyxl') as writer:
# 创建一个新的 Sheet
dataFrame = pd.DataFrame(dataSource)
if sheet_rule == 'one':
dataFrame.to_excel(writer, sheet_name='all_routes', index=False)
elif sheet_rule == 'more':
dataFrame.to_excel(writer, sheet_name=folder_name, index=False)
else:
existing_book = load_workbook('Data.xlsx')
with pd.ExcelWriter('Data.xlsx', engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
# 获取已有工作簿的工作表字典
existing_worksheets = existing_book.worksheets
existing_sheet_names = [ws.title for ws in existing_worksheets]
new_dataFrame = pd.DataFrame(dataSource)
# 如果目标工作表不存在,则创建新的工作表并写入数据
if sheet_rule == 'more' and folder_name not in existing_sheet_names:
new_dataFrame.to_excel(writer, sheet_name=folder_name, index=False)
elif sheet_rule == 'one':
# 如果目标工作表已存在,加载该工作表并追加数据
new_df_without_index = new_dataFrame.copy()
new_df_without_index.reset_index(drop=True, inplace=True)
combined_df = pd.concat([pd.read_excel('Data.xlsx', sheet_name='all_routes'), new_df_without_index], ignore_index=False)
combined_df.to_excel(writer, sheet_name='all_routes', index=False)
print(Fore.BLUE + "[*]Successfully saved data to xlsx!")
def scan_project_directory(directory, sheet_rule):
# 判断本路径下Data.xlsx文件是否存在,存在的话先删除
if os.path.exists('Data.xlsx'):
os.remove('Data.xlsx')
pom_files = find_all_pom_files(directory)
# 遍历所有pom.xml文件对应的项目路径,收集路由信息
for folder_name, folder_path in pom_files.items():
print(Fore.YELLOW + f"[+]Folder Name: {folder_name}, Folder Path: {folder_path}")
context = extract_context_path(folder_path)
all_routes = []
for root, _, files in os.walk(folder_path):
for file in files:
if file.endswith('.java'):
file_path = os.path.join(root, file)
routes = extract_routes_from_file(file_path, folder_path, folder_name, context)
if routes:
all_routes.extend(routes)
# 判断当前携带pom.xml文件的项目文件夹下提取的路由信息字典是否为空
if all_routes:
write_routes_to_xlsx(all_routes, folder_name, sheet_rule)
else:
print(Fore.RED + f"[-]No routes found in this project path: {folder_path}/{folder_name}…")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-sheet',
help='Use "one" or "more" to specify whether the generated routing information is concentrated in one sheet or divided into multiple sheets by project.')
parser.add_argument('-dir',
help='Specify the code path to be scanned')
args = parser.parse_args()
if args.sheet is not None and args.dir is not None:
scan_project_directory(args.dir, args.sheet.lower())
else:
print(Fore.RED + "[!]Please fully specify the -sheet and -dir parameters and values. For help, try the -h parameter.")