-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
0a6d861
commit 2344ee1
Showing
2 changed files
with
87 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import time | ||
from retrying import RetryError | ||
from loguru import logger | ||
from proxypool.schemas.proxy import Proxy | ||
from proxypool.crawlers.base import BaseCrawler | ||
import json | ||
|
||
BASE_URL = 'https://www.docip.net/data/free.json?t={date}' | ||
|
||
|
||
|
||
class DocipCrawler(BaseCrawler):
    """
    Docip crawler, https://www.docip.net/data/free.json

    Fetches the free-proxy JSON feed for the current date and yields
    one Proxy per entry.
    """
    # the feed URL is keyed by today's date in YYYYMMDD form
    urls = [BASE_URL.format(date=time.strftime("%Y%m%d", time.localtime()))]

    def parse(self, html):
        """
        Parse the JSON response body and yield proxies.

        :param html: response text; expected to be a JSON document with a
                     top-level 'data' list of {'ip': ..., 'port': ...} items
        :return: generator of Proxy objects
        """
        # keep the try body minimal: only json.loads can raise JSONDecodeError
        try:
            result = json.loads(html)
            proxy_list = result['data']
        except json.JSONDecodeError:
            # use the module logger instead of a bare print() so the failure
            # shows up in the pool's normal log stream
            logger.error('failed to decode JSON from docip free-proxy feed')
            return
        for proxy_item in proxy_list:
            host = proxy_item['ip']
            port = proxy_item['port']
            yield Proxy(host=host, port=port)
|
||
|
||
if __name__ == '__main__':
    # quick manual check: run the crawler and dump every proxy to stdout
    for fetched_proxy in DocipCrawler().crawl():
        print(fetched_proxy)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
from pyquery import PyQuery as pq | ||
from proxypool.schemas.proxy import Proxy | ||
from proxypool.crawlers.base import BaseCrawler | ||
from loguru import logger | ||
|
||
BASE_URL = 'https://ip.uqidata.com/free/index.html' | ||
|
||
|
||
class UqidataCrawler(BaseCrawler):
    """
    Uqidata crawler, https://ip.uqidata.com/free/index.html

    The site obfuscates ports by encoding each digit as a letter in the
    class attribute of the port cell; encode() reverses that scheme.
    """
    urls = [BASE_URL]
    # NOTE(review): ignore flag presumably excludes this crawler from the
    # default run — confirm against BaseCrawler's handling of `ignore`
    ignore = True

    @staticmethod
    def encode(input_str):
        """
        Decode an obfuscated port string.

        Each character maps to its index in 'ABCDEFGHIZ' (i.e. a digit);
        the digits are concatenated and the resulting integer is shifted
        right by 3 bits to recover the real port.

        :param input_str: the second class name of the td.port element
        :return: decoded port number as int
        """
        digits = "".join(str("ABCDEFGHIZ".find(ch)) for ch in input_str)
        return int(digits) >> 0x03

    def parse(self, html):
        """
        Parse the free-proxy table and yield proxies.

        :param html: HTML text of the listing page
        :return: generator of Proxy objects
        """
        doc = pq(html)
        # skip the first two header rows of the table
        trs = doc('#main_container .inner table tbody tr:nth-child(n+3)').items()
        for tr in trs:
            ip_html = tr('td.ip').find("*").items()
            host = ''
            for i in ip_html:
                # the site injects decoy elements hidden via display:none
                # and empty spans; skip both when assembling the host
                if i.attr('style') is not None and 'none' in i.attr('style'):
                    continue
                if i.text() == '':
                    continue
                host += i.text()

            # guard against malformed rows: the port cell must carry a
            # second class name holding the obfuscated port, otherwise
            # attr() returns None / split() has no second element
            port_class = tr('td.port').attr('class')
            if port_class is None or ' ' not in port_class:
                logger.warning(f'skipping row with unexpected port cell, host={host}')
                continue
            port = UqidataCrawler.encode(port_class.split(' ')[1])
            yield Proxy(host=host, port=port)
|
||
|
||
if __name__ == '__main__':
    # quick manual check: run the crawler and dump every proxy to stdout
    for fetched_proxy in UqidataCrawler().crawl():
        print(fetched_proxy)