# Python抓取豆瓣电影Top250数据 (Scraping the Douban Movie Top 250 data with Python)
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Scrape the Douban Movie Top 250 list, write it to a text file and load it into MySQL."""

import html
import re
from urllib import request

# NOTE: pymysql is imported inside MovieTop.upload() so that scraping and the
# text export keep working on machines without the database driver installed.

PAGE_SIZE = 25          # entries per listing page
LAST_PAGE_START = 225   # "start" offset of the last listing page
REQUEST_TIMEOUT = 30    # seconds; keeps a stalled connection from hanging forever

# Separator between year / country / genre. It is "&nbsp;/&nbsp;" in raw HTML;
# whitespace (which includes U+00A0 for str patterns) is accepted as well.
_FIELD_SEP = r'(?:&nbsp;|\s)/(?:&nbsp;|\s)'

# NOTE(review): the HTML tags of this pattern were lost in the copy of the
# script this was restored from. Only the order of the ten capture groups and
# the literals "导演:" and "人评价" were preserved. The tag/class names below are
# a reconstruction of the Douban Top250 markup -- confirm against a live page.
# Because the pattern uses lazy ".*?" with re.S, an entry that lacks one of the
# elements (for example the one-line quote) makes the match run on into the
# following entry, so verify that all 250 rows are captured.
_MOVIE_PATTERN = re.compile(
    r'<div[^>]*class="item">.*?'
    + r'<em[^>]*>(.*?)</em>.*?'                                  # rank
    + r'<span[^>]*class="title">(.*?)</span>.*?'                 # name
    + r'<span[^>]*class="other">(.*?)</span>.*?'                 # alias
    + r'<div[^>]*class="bd">.*?'
    + r'导演:\s(.*?)\s.*?<br\s*/?>'                               # director
    + r'(.*?)' + _FIELD_SEP                                      # year
    + r'(.*?)' + _FIELD_SEP                                      # country
    + r'(.*?)</p>.*?'                                            # genre
    + r'<span[^>]*class="rating_num"[^>]*>(.*?)</span>.*?'       # score
    + r'<span>(.*?)人评价</span>.*?'                              # number of votes
    + r'<span[^>]*class="inq"[^>]*>(.*?)</span>',                # one-line quote
    re.S,
)


class MovieTop(object):
    """Scraper for the Douban Movie Top 250 list.

    Attributes:
        start: "start" offset of the next listing page to fetch.
        param: unused query-string fragment, kept for backward compatibility.
        headers: HTTP headers sent with every request.
        movieList: scraped movies; each one is a list of ten strings
            (rank, name, alias, director, year, country, genre, score,
            number of votes, one-line quote).
        filePath: path of the text file written by write_page().
    """

    def __init__(self):
        self.start = 0
        self.param = '&filter'
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                                      "Chrome/65.0.3325.146 Safari/537.36"}
        self.movieList = []
        self.filePath = './DoubanTop250.txt'

    def get_page(self):
        """Fetch the listing page at offset ``self.start``.

        Returns:
            The decoded HTML on success (``self.start`` is then advanced by one
            page), or ``None`` if the request failed (``self.start`` is left
            unchanged and the reason is printed).
        """
        url = 'https://movie.douban.com/top250?start=' + str(self.start) + '&filter='
        my_request = request.Request(url, headers=self.headers)
        try:
            with request.urlopen(my_request, timeout=REQUEST_TIMEOUT) as response:
                page = response.read().decode('utf-8')
        except OSError as e:
            # URLError, HTTPError and socket timeouts are all OSError subclasses.
            print('获取失败,失败原因:', getattr(e, 'reason', e))
            return None
        print('正在获取第' + str((self.start + PAGE_SIZE) // PAGE_SIZE) + '页数据...')
        self.start += PAGE_SIZE
        return page

    def _parse_page(self, page):
        """Return the movies found in one page of HTML as lists of ten strings."""
        movies = []
        for fields in _MOVIE_PATTERN.findall(page):
            # Decode entities such as &nbsp; / &amp; and turn non-breaking
            # spaces into plain ones so the stripping below sees them.
            (rank, name, alias, director, year, country,
             genre, score, votes, quote) = (
                html.unescape(field).replace('\xa0', ' ') for field in fields)
            movies.append([rank, name, alias.lstrip(' /'), director,
                           year.lstrip(), country, genre.rstrip(),
                           score, votes, quote])
        return movies

    def get_page_info(self):
        """Fetch every remaining listing page and append its movies to ``movieList``."""
        while self.start <= LAST_PAGE_START:
            page = self.get_page()
            if page is None:
                # get_page() does not advance on failure; skip the page so a
                # persistent error cannot turn this into an endless loop.
                self.start += PAGE_SIZE
                continue
            self.movieList.extend(self._parse_page(page))

    def write_page(self):
        """Write ``movieList`` to ``filePath`` as UTF-8 text, one labelled field per line.

        I/O errors are printed rather than raised so that main() can carry on.
        """
        labels = ('电影排名:', '电影名称:', '电影别名:', '导演:', '上映年份:',
                  '制作国家/地区:', '电影类别:', '评分:', '参评人数:', '简短影评:')
        try:
            with open(self.filePath, 'w', encoding='utf-8') as file:
                for movie in self.movieList:
                    file.writelines(label + value + '\n'
                                    for label, value in zip(labels, movie))
                    file.write('\n')
        except OSError as e:
            print(e)
        else:
            print('成功写入文件,共有%d条记录...' % len(self.movieList))

    @staticmethod
    def _to_row(movie):
        """Convert one ``movieList`` entry into the tuple of INSERT parameters."""
        return (int(movie[0]), str(movie[1]), str(movie[2]), str(movie[3]),
                str(movie[4]), str(movie[5]), str(movie[6]),
                float(movie[7]), int(movie[8]), str(movie[9]))

    def upload(self):
        """Replace the contents of the ``doubanTop250`` table with ``movieList``.

        The delete and all inserts run in one transaction; on any database or
        conversion error the transaction is rolled back and the error printed.

        Raises:
            ImportError: if the ``pymysql`` driver is not installed.
        """
        import pymysql  # deferred on purpose, see the note at the top of the file

        # Values are passed as query parameters instead of being formatted into
        # the SQL text, so quotes inside titles or quotes cannot break (or
        # inject into) the statement. `rank` is back-quoted because RANK is a
        # reserved word in MySQL 8.
        insert_sql = ("INSERT INTO doubanTop250(`rank`, name, alias, director, "
                      "showYear, makeCountry, movieType, movieScore, scoreNum, shortFilm) "
                      "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration or environment variables.
        db = pymysql.connect(host="localhost", user="root", password="love1125",
                             database="PythonTest", charset='utf8')
        try:
            rows = [self._to_row(movie) for movie in self.movieList]
            with db.cursor() as cursor:
                cursor.execute('DELETE FROM doubanTop250')
                cursor.executemany(insert_sql, rows)
            db.commit()
            print('成功上传至数据库...')
        except (pymysql.MySQLError, ValueError) as e:
            print(e)
            db.rollback()
        finally:
            db.close()

    def main(self):
        """Run the whole pipeline: scrape, write the text file, upload to MySQL."""
        print('开始抓取豆瓣电影TOP250...')
        self.get_page_info()
        print('成功获取豆瓣电影TOP250...')

        print('开始写入文件...')
        self.write_page()

        print('开始上传至数据库...')
        self.upload()


if __name__ == '__main__':
    douban = MovieTop()
    douban.main()