Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Tayio_AI Web Scraping Assignment (Data Engineer) -- Anurag Krishnan #74

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
395 changes: 395 additions & 0 deletions .ipynb_checkpoints/Untitled-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,395 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "1e694b2f",
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import requests \n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "17a29900",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<Response [403]>\n"
]
}
],
"source": [
"req = requests.get('https://www.chinabidding.com/en')\n",
"print(req)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "71d046bd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<!DOCTYPE html>\n",
"<html>\n",
"<head>\n",
"<meta charset=\"utf-8\"/>\n",
"<style>\n",
"body{ background:#fff; font-family: microsoft yahei; color:#969696; font-size:14px;}\n",
".online-desc-con { text-align:center; }\n",
".r-tip01 { color: #333; font-size: 18px; display: block; text-align: center; width: 600px; padding: 0 10px; overflow: hidden; text-overflow: ellipsis; margin: 0 auto 15px; }\n",
".r-tip02 { color: #585858; font-size: 14px; display: block; margin-top: 20px; margin-bottom: 20px; }\n",
"#notice-jiasule {\n",
" word-wrap: break-word;\n",
" word-break: normal;\n",
" color:#585858;\n",
" border:1px solid #ddd;\n",
" padding:0px 20px 0px 20px\n",
"}\n",
"img { border: 0; }\n",
".u-ico{ vertical-align: middle; margin-right: 12px;}\n",
".btn{ padding: 8px 22px; border-radius: 3px; border: 0; display: inline-block;vertical-align: middle;text-decoration: none;}\n",
".btn-g{ background-color: #61b25e; color: #fff;}\n",
".report {color: #858585; text-decoration: none;}\n",
".report:hover {text-decoration: underline; color: #0088CC;}\n",
"hr{ border-top: 1px dashed #ddd;}\n",
"center{ line-height: 48px; color: #919191;}\n",
".b-box {background: #0B6FD6;margin: -8px -8px 0 -8px;height: 32px;}\n",
".b-body {position: relative;left:50%;max-width: 1200px;transform: translateX(-50%);padding-top: 5px;white-space:nowrap;}\n",
".b-ico-box {display: inline-block;margin-right: 4px;}\n",
".b-ico {height: 18px;width: 18px;}\n",
".b-msg-box {display: inline-block;font-size: 14px;color: #fff;width: 96%;white-space:nowrap;overflow:hidden;}\n",
".b-msg {display: inline-block;font-size: 14px;color: #fff;padding-right: 20px;}\n",
"#second {font-style: normal;}\n",
".dn {display: none;}\n",
"</style>\n",
"</head>\n",
"<body>\n",
"<div class=\"b-box\"><div class=\"b-body\"><div class=\"b-ico-box\"><img alt=\"\" class=\"b-ico\" src=\"/cdn-cgi/image/alt.svg\"/></div><div class=\"b-msg-box\" id=\"msg-box\"><a class=\"b-msg\" href=\"#\" id=\"flink\"></a><a class=\"b-msg\" href=\"#\" id=\"slink\"></a></div></div></div>\n",
"<div class=\"online-desc-con\" style=\"width:640px;padding-top:15px;margin:34px auto;\">\n",
"<img alt=\"\" id=\"wafblock\" style=\"margin: 0 auto 17px auto;\"/>\n",
"<div id=\"content_rendered\">\n",
"<span class=\"r-tip01\" id=\"error_403\"></span>\n",
"<span class=\"r-tip01\" id=\"error_403_en\"></span>\n",
"<div id=\"notice-jiasule\">\n",
"<p>当前网址:<span id=\"url\"></span></p>\n",
"<p>客户端特征:<span id=\"user_agent\"></span></p>\n",
"<p>拦截时间:<span id=\"now\"></span>  本次事件ID <span id=\"rule_id\"></span></p>\n",
"</div>\n",
"<span class=\"r-tip02\">\n",
"<img alt=\"\" class=\"u-ico\" src=\"/cdn-cgi/image/guest.png\"/>如果您是网站管理员,请登录知道创宇云防御 \n",
" <a class=\"btn btn-g\" href=\"#\" id=\"detail-link\" target=\"_blank\">查看详情</a>\n",
"  或者 \n",
" <a class=\"report\" href=\"#\" id=\"report-link\" target=\"_blank\">反馈误报</a>\n",
"</span>\n",
"<span class=\"r-tip02 dn\" id=\"second-box\"><em id=\"second\"></em>秒后进入帮助页面</span>\n",
"</div>\n",
"<hr/>\n",
"<center>client: <span id=\"client_ip\"></span>, server: cf54e48, time: <span id=\"time_error\"></span></center>\n",
"</div>\n",
"<script>\n",
" void(function fuckie6(){if(location.hash && /MSIE 6/.test(navigator.userAgent) && !/jsl_sec/.test(location.href)){location.href = location.href.split('#')[0] + '&jsl_sec' + location.hash}})();\n",
" var data = {\"rule_id\":\"80001\",\"t_msg\":\"全球领先的IT市场研究和咨询公司IDC发布最新研究报告《中国云Web应用防火墙市场份额,2022:云上云下协同发展,云原生WAF成为必然》知道创宇【创宇盾】产品凭借过硬的技术实力及成熟的市场表现,在中国整体云WAF市场及公有云WAF市场中,均居专业安全厂商第一!\",\"error_403_type\":\"\",\"error_403_en\":\"Knownsec CloudWAF: Your request has been blocked due to suspected hacking\",\"t_link\":\"https:\\/\\/mp.weixin.qq.com\\/s\\/o0ZGlxEQ1786abnXygyTqQ\",\"error_403\":\"\",\"client_ip\":\"49.36.110.35\",\"time_error\":\"15\\/Oct\\/2023:15:43:53 +0800\"};\n",
" var from = encodeURIComponent(document.referrer.substr(0, 1024));\n",
" var rule_id = parseInt(data['rule_id']) || '';\n",
" var client_ip = data['client_ip'];\n",
" var ref = encodeURIComponent(document.URL.substr(0, 1024));\n",
" document.getElementById(\"wafblock\").src = '/cdn-cgi/image/' + (data['error_403_type'] || 'hacker') + '.png';\n",
" document.getElementById('error_403').innerText = data['error_403'] || '创宇盾提示您:当前访问疑似黑客攻击,已被网站管理员设置为拦截';\n",
" document.getElementById('error_403_en').innerText = data['error_403_en'];\n",
" document.getElementById('url').innerText = document.URL.replace(/\\</g,\"%3C\").replace(/\\>/g,\"%3E\");\n",
" document.getElementById('user_agent').innerText = navigator.userAgent;\n",
" document.getElementById('now').innerText = new Date(new Date() - -8 * 3600000).toISOString().substr(0, 19).replace('T', ' ');\n",
" document.getElementById('rule_id').innerText = rule_id;\n",
" document.getElementById('client_ip').innerText = client_ip;\n",
" document.getElementById('time_error').innerText = data['time_error'];\n",
" document.getElementById('detail-link').href = 'http://help.yunaq.com/feedback.html?from=' + from + '&rule_id=' + rule_id + '&client_ip=' + client_ip + '&referrer=' + ref + '#pus';\n",
" document.getElementById('report-link').href = 'http://help.yunaq.com/feedback.html?from=' + from + '&rule_id=' + rule_id + '&client_ip=' + client_ip + '&referrer=' + ref + '#hus';\n",
" if (data['error_403_type'] === 'frequency_high' || !data['error_403_type']) {\n",
" var sbox = document.getElementById('second-box');\n",
" sbox.classList.remove('dn');\n",
" var second = 2;\n",
" var ele = document.getElementById('second');\n",
" ele.innerText = second;\n",
" var link_url;\n",
" if (data['error_403_type'] === 'frequency_high') {\n",
" link_url = \"https://help.yunaq.com/limit_rate.html\"\n",
" } else if (!data['error_403_type']) {\n",
" link_url = \"https://help.yunaq.com/waf.html\";\n",
" }\n",
" var eid = setInterval(function() {\n",
" second--;\n",
" ele.innerText = second;\n",
" if (second == 0) {\n",
" clearInterval(eid)\n",
" sbox.outerHTML = \"\";\n",
" window.location.href = link_url;\n",
" }\n",
" }, 1000);\n",
" document.getElementById('detail-link').addEventListener(\"click\", function() {\n",
" clearInterval(eid);\n",
" sbox.outerHTML = \"\";\n",
" });\n",
" document.getElementById('report-link').addEventListener(\"click\", function() {\n",
" clearInterval(eid);\n",
" sbox.outerHTML = \"\";\n",
" });\n",
" }\n",
" function getOffsetLimit(msgW, boxW, pad) {var limit = init = 0;if (msgW+pad>boxW){limit = msgW+pad;if (msgW + pad - boxW > 20){init = pad}}else{limit = (msgW+pad)*2;init = -boxW}return {limit: limit, init: init}};\n",
" var fst = document.getElementById('flink');\n",
" var last = document.getElementById('slink');\n",
" fst.innerText = last.innerText = data['t_msg'];\n",
" fst.href = last.href = data['t_link'];\n",
" var msgW = fst.getBoundingClientRect().width;\n",
" var msgBox = document.getElementById('msg-box');\n",
" var boxW = msgBox.getBoundingClientRect().width;\n",
" var run = getOffsetLimit(msgW, boxW, 20);\n",
" var ofst = 0;\n",
" function runner() {ofst = ofst >= run.limit ? run.init : (ofst + 2);fst.style.transform = 'translateX('+(-ofst)+'px)';last.style.transform = 'translateX('+(-ofst)+'px)';};\n",
" setInterval(runner, 100);\n",
"</script>\n",
"</body>\n",
"</html>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"soup = BeautifulSoup(req.content)\n",
"soup"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "614b4304",
"metadata": {},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (Temp/ipykernel_5308/1252400516.py, line 2)",
"output_type": "error",
"traceback": [
"\u001b[1;36m File \u001b[1;32m\"C:\\Users\\anura\\AppData\\Local\\Temp/ipykernel_5308/1252400516.py\"\u001b[1;36m, line \u001b[1;32m2\u001b[0m\n\u001b[1;33m for i in soup.find_all('a',class = \"item-title-text bold fs18\"):\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
]
}
],
"source": [
"tenders_headings = []\n",
"for i in soup.find_all('a',class_= \"item-title-text bold fs18\"):\n",
" tenders_headings.append(i.text)\n",
"tenders_headings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "caafc3b4",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f22d64a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "486a6d63",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb1582bb",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "669ed5df",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "2fd95ce9",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "19fedcd2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc03a718",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e5e3f37",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "77814f1b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb3264c7",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "f784ebf7",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "be13148e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7999007",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "61d3a533",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "b266fe0c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "437051df",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c66295a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4fcf867a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1712323",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b442596",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "5cd96d87",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading