From 2344ee1db07e9095e00153fa41280cfea436c8a4 Mon Sep 17 00:00:00 2001
From: Shayne Wang <1614565666@qq.com>
Date: Fri, 1 Dec 2023 19:05:08 +0800
Subject: [PATCH] Add two proxy crawlers (#203)

---
 proxypool/crawlers/public/docip.py   | 37 +++++++++++++++++++
 proxypool/crawlers/public/uqidata.py | 54 ++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 proxypool/crawlers/public/docip.py
 create mode 100644 proxypool/crawlers/public/uqidata.py

diff --git a/proxypool/crawlers/public/docip.py b/proxypool/crawlers/public/docip.py
new file mode 100644
index 0000000..070c598
--- /dev/null
+++ b/proxypool/crawlers/public/docip.py
@@ -0,0 +1,37 @@
+import time
+import json
+
+from loguru import logger
+from proxypool.schemas.proxy import Proxy
+from proxypool.crawlers.base import BaseCrawler
+
+BASE_URL = 'https://www.docip.net/data/free.json?t={date}'
+
+
+class DocipCrawler(BaseCrawler):
+    """
+    Docip crawler, https://www.docip.net/data/free.json
+    """
+    urls = [BASE_URL.format(date=time.strftime("%Y%m%d", time.localtime()))]
+
+    def parse(self, html):
+        """
+        parse the JSON response to extract proxies
+        :return:
+        """
+        try:
+            result = json.loads(html)
+            proxy_list = result['data']
+            for proxy_item in proxy_list:
+                host = proxy_item['ip']
+                port = proxy_item['port']
+                yield Proxy(host=host, port=port)
+        except json.JSONDecodeError:
+            logger.error('failed to decode JSON from docip response')
+            return
+
+
+if __name__ == '__main__':
+    crawler = DocipCrawler()
+    for proxy in crawler.crawl():
+        print(proxy)
diff --git a/proxypool/crawlers/public/uqidata.py b/proxypool/crawlers/public/uqidata.py
new file mode 100644
index 0000000..3e54b2d
--- /dev/null
+++ b/proxypool/crawlers/public/uqidata.py
@@ -0,0 +1,54 @@
+from pyquery import PyQuery as pq
+from proxypool.schemas.proxy import Proxy
+from proxypool.crawlers.base import BaseCrawler
+
+BASE_URL = 'https://ip.uqidata.com/free/index.html'
+
+
+class UqidataCrawler(BaseCrawler):
+    """
+    Uqidata crawler, https://ip.uqidata.com/free/index.html
+    """
+    urls = [BASE_URL]
+    ignore = True
+
+    @staticmethod
+    def encode(input_str):
+        # the site obfuscates each port as a letter code in the td.port
+        # class attribute: A-I and Z stand for the digits 0-9, and the
+        # decoded number is right-shifted by 3 to give the real port
+        tmp = []
+        for ch in input_str:
+            tmp.append("ABCDEFGHIZ".find(ch))
+        result = "".join(str(i) for i in tmp)
+        result = int(result) >> 0x03
+        return result
+
+    def parse(self, html):
+        """
+        parse the HTML table to extract proxies
+        :return:
+        """
+        doc = pq(html)
+        trs = doc('#main_container .inner table tbody tr:nth-child(n+3)').items()
+        for tr in trs:
+            # the IP cell mixes real text with display:none decoys,
+            # so keep only the visible fragments
+            ip_html = tr('td.ip').find("*").items()
+            host = ''
+            for i in ip_html:
+                if i.attr('style') is not None and 'none' in i.attr('style'):
+                    continue
+                if i.text() == '':
+                    continue
+                host += i.text()
+
+            port_code = tr('td.port').attr('class').split(' ')[1]
+            port = UqidataCrawler.encode(port_code)
+            yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = UqidataCrawler()
    for proxy in crawler.crawl():
        print(proxy)
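
Reviewer note: DocipCrawler.parse can be sanity-checked offline. The sample
below is hand-made, assuming only the 'data'/'ip'/'port' shape that parse()
actually reads; the live feed may carry extra fields per entry.

    from proxypool.crawlers.public.docip import DocipCrawler

    # hand-made sample mirroring only the fields parse() touches
    sample = '{"data": [{"ip": "1.2.3.4", "port": "8080"}]}'
    for proxy in DocipCrawler().parse(sample):
        print(proxy)  # should print 1.2.3.4:8080, given Proxy's string form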
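
Reviewer note: the uqidata port de-obfuscation is easy to verify with a
made-up class code. 'GEGEA' is not captured from the site; it simply maps
to the digit string 64640, and 64640 >> 3 == 8080.

    from proxypool.crawlers.public.uqidata import UqidataCrawler

    # 'GEGEA' -> digits 6,4,6,4,0 -> 64640; 64640 >> 3 == 8080
    assert UqidataCrawler.encode('GEGEA') == 8080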