import requests
import time
import re
import csv
import random
from fake_useragent import UserAgent
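# Scrapes used-car listings (model, registration date, displayed mileage,
# price) from Guazi's Beijing listing pages and writes them to a CSV file.
# Note: the hard-coded URL pattern and the regex below are tied to the site's
# markup at the time of writing and may need updating if the page changes.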
# Open the output file once at module level; it is closed in the __main__ block.
f = open('guazi_used_cars.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(f)
writer.writerow(['Model', 'Registration date', 'Displayed mileage', 'Price (10k CNY)'])
class CarSpider:
    def __init__(self):
        # Listing-page URL template: '{}' takes the 1-based page number;
        # the 'r8' suffix appears to be a site-specific filter/sort parameter.
        self.url = 'http://www.guazi.com/bj/buy/o{}r8'
    def get_url(self, choose):
        # Build one listing-page URL per requested page; the actual requests
        # (and their headers and delays) happen in save_one_car_info.
        urls = []
        for page in range(1, choose + 1):
            urls.append(self.url.format(page))
        return urls
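    # For example, get_url(2) returns:
    #   ['http://www.guazi.com/bj/buy/o1r8', 'http://www.guazi.com/bj/buy/o2r8']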
    def save_one_car_info(self, urls):
        for url in urls:
            # Rotate the User-Agent and pause briefly between requests to
            # reduce the chance of being blocked.
            headers = {"User-Agent": UserAgent().random}
            time.sleep(random.randint(1, 2))
            html = requests.get(url, headers=headers, timeout=10).content.decode('utf-8')
            # Each '<li data-scroll-track=...>' block is one listing; the four
            # capture groups are the model title, registration date, displayed
            # mileage, and price (in units of 10,000 CNY).
            car_infos = re.findall(
                '<li data-scroll-track=.*?<a title="(.*?)".*?<div class="t-i">(.*?)'
                '<span class="icon-pad">.*?</span>(.*?)<span.*?>.*?<p>(.*?)'
                '<span>万</span>.*?</p>',
                html, re.S)
            for car_name, car_year, car_km, car_money in car_infos:
                print(car_name, car_year, car_km, car_money)
                writer.writerow([car_name, car_year, car_km, car_money])
    def run_main(self):
        choose = int(input('Scrape up to which page (counting from page 1): '))
        urls = self.get_url(choose)
        self.save_one_car_info(urls)
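
# A minimal, more defensive variant of the download step, as a sketch only:
# 'fetch' is not part of the original flow, and the retry count and timeout
# values are illustrative assumptions. It rotates the User-Agent, applies a
# timeout, and retries on network or HTTP errors.
def fetch(url, retries=3):
    for attempt in range(retries):
        headers = {"User-Agent": UserAgent().random}
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp.content.decode('utf-8')
        except requests.RequestException:
            time.sleep(random.randint(1, 2))
    return None
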
if __name__ == '__main__':
    spider = CarSpider()
    spider.run_main()
    f.close()