#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-11-30 15:46:23
# Project: ttwanda_3
from pyspider.libs.base_handler import *
import re
import json
from pyspider.libs.utils import md5string
class Handler(BaseHandler):
    """pyspider handler that crawls film entries from www.ttwanda.com.

    Callback chain: ``on_start`` -> ``index_page`` -> ``film_list_page`` ->
    ``film_detail_page`` -> ``film_video_page``.  Fields collected along the
    way are handed from one callback to the next through the ``save``
    argument of ``self.crawl`` and read back from ``response.save``.
    """

    crawl_config = {
    }

    # Film listing root ("/film") or one of its numbered pages
    # ("/film/page/<n>").  Raw string so ``\d`` is a regex escape rather than
    # an (invalid) string escape; dots are escaped so they match literally.
    _LIST_URL_RE = re.compile(
        r"http://www\.ttwanda\.com/film/page/\d+|http://www\.ttwanda\.com/film$"
    )

    # Inline player configuration, e.g. ``var play_type="x",vid="y";``.
    _PLAYER_VARS_RE = re.compile(r'var play_type="(\w+)",vid="(\w+)";')

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the site front page (scheduled every 24 hours)."""
        self.crawl('http://www.ttwanda.com', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every absolute link that points at a film listing page.

        Each listing page starts with an empty ``save`` dict that later
        callbacks fill in.
        """
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            if self._LIST_URL_RE.match(href):
                self.crawl(href, callback=self.film_list_page, save={})

    def film_list_page(self, response):
        """Queue the detail page of every film shown on a listing page.

        The poster URL and the rating text are read from the listing entry
        and forwarded to ``film_detail_page`` via ``save``.  The "next page"
        link, when present, is fed back into ``index_page``.
        """
        inherited = response.save or {}
        for each in response.doc('article.u-movie').items():
            detail_url = each('.list-poster a[href^="http"]').attr.href
            if not detail_url:
                continue
            # Build a fresh dict per film: crawl() keeps a reference to the
            # ``save`` object, so reusing one dict across iterations would
            # leave every queued task with the last film's poster/star.
            film = dict(inherited)
            film['poster'] = each('img').attr['data-original']
            film['star'] = each('.pingfen').text()
            self.crawl(detail_url, callback=self.film_detail_page,
                       save=film, priority=1)

        # The selector yields no href when there is no "next page" link.
        next_url = response.doc('.next-page a').attr('href')
        if next_url:
            self.crawl(next_url, callback=self.index_page)

    def film_detail_page(self, response):
        """Queue every play link listed on a film detail page."""
        film = response.save or {}
        for each in response.doc('.mplay-list a').items():
            video_url = each.attr.href
            if video_url:
                self.crawl(video_url, callback=self.film_video_page, save=film)

    def film_video_page(self, response):
        """Build the result record for one video page.

        Returns:
            dict: the fields forwarded via ``save`` plus ``title`` and
            ``url``; ``vtype`` and ``vid`` are added when a ``<script>``
            on the page contains the ``play_type``/``vid`` assignment.
        """
        film = dict(response.save or {})
        film['title'] = response.doc('.player_box>strong').text()
        film['url'] = response.url
        for each in response.doc('script').items():
            found = self._PLAYER_VARS_RE.search(each.text())
            if found:
                film['vtype'] = found.group(1)
                film['vid'] = found.group(2)
                break
        return film
# Comments (0)  -- NOTE(review): appears to be leftover text from the web page this script was copied from