-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathavito.py
121 lines (88 loc) · 2.92 KB
/
avito.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# https://chrome.google.com/webstore/detail/export-historybookmarks-t/dcoegfodcnjofhjfbhegcgjgapeichlf?hl=en
import requests
from bs4 import BeautifulSoup
import csv
import os
import bot
from tkinter import *
from tkinter import messagebox as mb
def get_html(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout by default, so without one a
            stalled connection would block the caller indefinitely.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
def get_total_pages(html):
    """Return the page count advertised by the pagination block of *html*.

    The count is read from the ``p`` query parameter of the last
    ``a.pagination-page`` link inside ``div.pagination-pages``.

    Args:
        html: Markup of a search-results page.

    Returns:
        The page number carried by the last pagination link, or ``1`` when
        the page has no pagination links (treated as a single page of
        results).

    Raises:
        ValueError: If the last pagination link has no numeric ``p`` value.
    """
    # Local import keeps this block self-contained; only the stdlib is used.
    from urllib.parse import parse_qs, urlparse

    soup = BeautifulSoup(html, 'lxml')
    pagination = soup.find('div', class_='pagination-pages')
    if pagination is None:
        return 1

    links = pagination.find_all('a', class_='pagination-page')
    if not links:
        return 1

    href = links[-1].get('href') or ''
    # Look the parameter up by name instead of by position so the result
    # does not depend on the order of the query-string parameters.
    page_values = parse_qs(urlparse(href).query).get('p')
    if not page_values:
        raise ValueError('no page number in pagination link: %r' % href)
    return int(page_values[0])
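# BeautifulSoup reminder: find('div', ...) returns the first matching tag,
# find_all(...) returns a list of every matching tag.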
def write_csv(data):
    """Append one advert as a row to ``avito.csv`` in the working directory.

    Columns are written in the order title, price, metro, url, number.

    Args:
        data: Mapping with the keys ``title``, ``price``, ``metro`` and
            ``url``. ``number`` is optional and is written as an empty
            cell when the mapping does not contain it.

    Raises:
        KeyError: If one of the four required keys is missing.
    """
    row = (
        data['title'],
        data['price'],
        data['metro'],
        data['url'],
        # The phone number is not always supplied by the caller; a missing
        # value must not abort the whole row.
        data.get('number', ''),
    )
    # newline='' hands line-ending control to the csv module (otherwise
    # Windows gets a blank line after every row); an explicit encoding keeps
    # non-ASCII advert text from depending on the platform default.
    with open('avito.csv', 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(row)
def get_page_data(html):
    """Extract every advert from a results page and append it to the CSV.

    For each ``div.item_table`` inside ``div.catalog-list`` the title, URL,
    price, the text of the last ``<p>`` in ``div.data`` (stored as
    ``metro``) and a phone number are collected and passed to ``write_csv``.
    A field whose markup is missing is written as an empty string.

    Args:
        html: Markup of one search-results page.

    Returns:
        None. Rows are emitted through ``write_csv`` as a side effect.
    """
    soup = BeautifulSoup(html, 'lxml')
    catalog = soup.find('div', class_='catalog-list')
    if catalog is None:
        # No listing container on this page, so there is nothing to extract.
        return

    for ad in catalog.find_all('div', class_='item_table'):
        description = ad.find('div', class_='description')
        heading = description.find('h3') if description is not None else None

        title = heading.text.strip() if heading is not None else ''

        link = heading.find('a') if heading is not None else None
        href = link.get('href') if link is not None else None
        url = 'https://www.avito.ru' + href if href is not None else ''

        about = ad.find('div', class_='about')
        price = about.text.strip() if about is not None else ''

        details = ad.find('div', class_='data')
        paragraphs = details.find_all('p') if details is not None else []
        metro = paragraphs[-1].text.strip() if paragraphs else ''

        number = ''
        if url:
            # NOTE(review): bot.Bot().parse is a project-local helper whose
            # failure modes are not visible here; the lookup stays
            # best-effort so one failing advert does not stop the page.
            try:
                number = bot.Bot().parse(url)
                print(number)
            except Exception:
                number = ''

        data = {
            'title': title,
            'price': price,
            'metro': metro,
            'url': url,
            # write_csv emits a "number" column, so the key must be present.
            'number': number,
        }
        write_csv(data)
def main():
    """Open the Tk window and scrape the search URL typed into it.

    The window holds one entry field and one button. Pressing the button
    reads the URL, asks ``get_total_pages`` how many result pages exist and
    feeds every page to ``get_page_data``.
    """
    def check():
        """Button callback: scrape all result pages of the entered URL."""
        url = entry.get().strip()
        entry.delete(0, END)
        if not url:
            # Nothing was entered; avoid issuing a request for an empty URL.
            return

        # partition() copes with URLs that have no query string, where
        # split('?')[1] would raise IndexError.
        base, _, query = url.partition('?')
        query_part = '&' + query if query else ''

        total_pages = get_total_pages(get_html(url))
        # Page numbers are 1-based and the last page must be included.
        for page in range(1, total_pages + 1):
            url_gen = base + '?p=' + str(page) + query_part
            get_page_data(get_html(url_gen))

    root = Tk()
    root.minsize(500, 200)
    entry = Entry(width=80)
    entry.pack(pady=20)
    Button(text='Парсить', command=check).pack()
    root.mainloop()
# Example search URL to paste into the window:
# https://www.avito.ru/krasnodarskiy_kray/ptitsy?q=%D0%BA%D1%83%D1%80%D0%B8%D1%86%D1%8B
if __name__ == '__main__':
    # The output file is opened in append mode by write_csv, so a file left
    # over from a previous run is removed before the GUI starts.
    output_path = 'avito.csv'
    has_previous_output = os.path.exists(output_path)
    if has_previous_output:
        os.remove(output_path)
    main()