Skip to content

Commit

Permalink
URL pattern detect
Browse files Browse the repository at this point in the history
  • Loading branch information
NaiboWang committed Jul 5, 2023
1 parent a0dbaea commit f84eb67
Show file tree
Hide file tree
Showing 14 changed files with 150 additions and 60 deletions.
8 changes: 1 addition & 7 deletions ElectronJS/config.json
Original file line number Diff line number Diff line change
@@ -1,7 +1 @@
{
"webserver_address": "http://localhost",
"webserver_port": 8074,
"user_data_folder": "./user_data",
"debug": true,
"absolute_user_data_folder": "D:\\Document\\Projects\\EasySpider\\ElectronJS\\user_data"
}
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":true,"absolute_user_data_folder":"D:\\Document\\Projects\\EasySpider\\ElectronJS\\user_data"}
2 changes: 1 addition & 1 deletion ElectronJS/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ function handleOpenBrowser(event, lang = "en", user_data_folder = "", mobile = f
runBrowser(lang, user_data_folder, mobile);
let size = screen.getPrimaryDisplay().workAreaSize;
let width = parseInt(size.width);
let height = parseInt(size.height * 0.65);
let height = parseInt(size.height * 0.6);
flowchart_window = new BrowserWindow({
x: 0,
y: 0,
Expand Down
22 changes: 15 additions & 7 deletions ElectronJS/src/taskGrid/FlowChart_CN.html
Original file line number Diff line number Diff line change
Expand Up @@ -513,25 +513,33 @@ <h4 class="modal-title">等价XPath</h4>
<h4 class="modal-title" id="myModalLabel">保存任务</h4>
<button type="button" class="close" data-dismiss="modal" aria-hidden="true">&times;</button>
</div>
<div class="modal-body">
<div class="modal-body" style="height:400px;overflow: auto">
<input onkeydown="inputDelete(event)" id="serviceId" type="hidden" name="serviceId" value="-1"></input>
<input onkeydown="inputDelete(event)" id="url" type="hidden" name="url" value="about:blank"></input>
<label>任务名称:</label>
<input onkeydown="inputDelete(event)" required name="serviceName" value="新web采集任务" id="serviceName" class="form-control"></input>
<label>任务描述:</label>
<input onkeydown="inputDelete(event)" id="serviceDescription" name="serviceDescription" class="form-control"></input>
<label>每采集多少条数据保存一次(值越大采集速度越快,但如果意外退出则有数据丢失风险):</label>
<input onkeydown="inputDelete(event)" type="number" value="10" id="saveThreshold" name="saveThreshold" class="form-control"></input>
<label>是否为cloudflare等极端反爬网站:</label>
<select id="cloudflare" name="cloudflare" class="form-control">
<option value = 0></option>
<option value = 1></option>
<label>导出数据格式:</label>
<select id="outputFormat" class="form-control">
<option value = "csv">CSV</option>
<option value = "xlsx">XLSX(EXCEL)</option>
<option value = "mysql">MySQL</option>
</select>
<label>浏览器模拟类型:</label>
<select id="environment" name="environment" class="form-control">
<option value = 0>电脑端</option>
<option value = 1>手机端(Cloudflare模式下不支持)</option>
</select>
<label>是否为cloudflare等极端反爬网站:</label>
<select id="cloudflare" name="cloudflare" class="form-control">
<option value = 0></option>
<option value = 1></option>
</select>
<label>每采集多少条数据保存一次(值越大采集速度越快,但如果意外退出则有数据丢失风险):</label>
<input onkeydown="inputDelete(event)" type="number" value="10" id="saveThreshold" name="saveThreshold" class="form-control"></input>
<label>控制台预览时数据最大显示长度:</label>
<input onkeydown="inputDelete(event)" type="number" value="15" id="maxViewLength" class="form-control"></input>

</div>
<div class="modal-footer">
Expand Down
16 changes: 16 additions & 0 deletions ElectronJS/src/taskGrid/logic_CN.js
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,8 @@ function saveService(type) {
"saveThreshold": saveThreshold,
"cloudflare": cloudflare,
"environment": environment,
"maxViewLength": parseInt($("#maxViewLength").val()),
"outputFormat": $("#outputFormat").val(),
"containJudge": containJudge,
"desc": serviceDescription,
"inputParameters": inputParameters,
Expand Down Expand Up @@ -460,10 +462,24 @@ if (sId != null && sId != -1) //加载任务
$.get(backEndAddressServiceWrapper + "/queryTask?id=" + sId, function(result) {
nodeList = result["graph"];
app.$data.list.nl = nodeList;
for(let node of nodeList){ //兼容旧版本
if(node["option"] == 1){
if(!("cookies" in node["parameters"])) {
node["parameters"]["cookies"] = "";
}
}
}
$("#serviceName").val(result["name"]);
$("#serviceId").val(result["id"]);
$("#url").val(result["url"]);
$("#serviceDescription").val(result["desc"]);
for(let key of Object.keys(result)){
try{
$("#"+key).val(result[key]);
} catch(e){
console.log(e);
}
}
refresh();
});
} else {
Expand Down
2 changes: 1 addition & 1 deletion ElectronJS/tasks/1.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion ElectronJS/tasks/112.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion ElectronJS/tasks/142.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"id":142,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/6/2023, 3:38:35 AM","version":"0.3.5","saveThreshold":10,"cloudflare":0,"environment":0,"containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"string","exampleValue":"/手机/数码"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":"test=123\nipLoc-djd=53283-53456-0-0\nareaId=53283\nmba_sid=16885856346417163685425076773.0\n__jdc=122270672\n__jdb=122270672.1.16885856346381587112207|1.1688585634\nmba_muid=16885856346381587112207\n__jdv=122270672%7Clocalhost%3A8074%7C-%7Creferral%7C-%7C1688585634639\n__jda=122270672.16885856346381587112207.1688585634.1688585634.1688585634.1"}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]","//div[contains(., '/手机/数码')]","//DIV[@class='LeftSide_menu_item__SBMWC LeftSide_text_space__2UhbG ']","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"/手机/数码"}],"unique_index":"p2h2i1dva8ljq4aje2","iframe":false,"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
{"id":142,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/6/2023, 4:08:31 AM","version":"0.3.5","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"xlsx","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"string","exampleValue":"/手机/数码"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":"test=123\nipLoc-djd=53283-53456-0-0\nareaId=53283\nmba_sid=16885856346417163685425076773.0\n__jdc=122270672\n__jdb=122270672.1.16885856346381587112207|1.1688585634\nmba_muid=16885856346381587112207\n__jdv=122270672%7Clocalhost%3A8074%7C-%7Creferral%7C-%7C1688585634639\n__jda=122270672.16885856346381587112207.1688585634.1688585634.1688585634.1"}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]","//div[contains(., '/手机/数码')]","//DIV[@class='LeftSide_menu_item__SBMWC LeftSide_text_space__2UhbG ']","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"/手机/数码"}],"unique_index":"p2h2i1dva8ljq4aje2","iframe":false,"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
1 change: 1 addition & 0 deletions ElectronJS/tasks/143.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":143,"name":"中国知网","url":"https://chn.oversea.cnki.net/index/","links":"https://chn.oversea.cnki.net/index/","create_time":"7/6/2023, 4:50:52 AM","version":"0.3.5","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"csv","containJudge":false,"desc":"https://chn.oversea.cnki.net/index/","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://chn.oversea.cnki.net/index/","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://chn.oversea.cnki.net/index/"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://chn.oversea.cnki.net/index/","links":"https://chn.oversea.cnki.net/index/","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}}]}
Loading

0 comments on commit f84eb67

Please sign in to comment.