def parse_question(self, response):
    # Process the question page and extract a question item from it.
    if "QuestionHeader-title" in response.text:
        # New-style question page.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()
    else:
        # Old-style page: extract the item with the legacy selectors.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)

        if match_obj:
            question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # item_loader.add_css("title", ".zh-question-title h2 a::text")
        item_loader.add_xpath("title",
                              "//*[@id='zh-question-title']/h2/a/text()"
                              "|//*[@id='zh-question-title']/h2/span/text()")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
        item_loader.add_xpath("watch_user_num",
                              "//*[@id='zh-question-side-header-wrap']/text()"
                              "|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

        question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
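
# The snippet above references self.start_answer_url and self.headers without
# defining them. A minimal sketch of those spider attributes, assuming the
# Zhihu v4 answers API; the exact URL template and header values are
# assumptions, not taken from the original source:
import scrapy

class ZhihuQuestionSpiderSketch(scrapy.Spider):
    name = "zhihu_question_sketch"  # hypothetical spider name
    # Positional slots match self.start_answer_url.format(question_id, 20, 0):
    # {0} = question id, {1} = limit, {2} = offset.
    start_answer_url = ("https://www.zhihu.com/api/v4/questions/{0}/answers"
                        "?limit={1}&offset={2}")
    headers = {"User-Agent": "Mozilla/5.0", "Host": "www.zhihu.com"}
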
def parse(self, response):
    l = ItemLoader(item=PlantItem(), response=response)

    l.add_xpath('name', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/h2/text()")
    l.add_xpath('species', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/div[@class='clear resultSpecies']/text()")
    l.add_xpath('key', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-key']/text()")
    l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/child::node()")
    # l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/a/text()")

    return l.load_item()
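
# A hypothetical PlantItem definition consistent with the loader calls above;
# the real project may declare extra fields or attach input/output processors.
import scrapy

class PlantItem(scrapy.Item):
    name = scrapy.Field()
    species = scrapy.Field()
    key = scrapy.Field()
    value = scrapy.Field()
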
def parse_song_list(self, response):
    selector = Selector(response)

    song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
    song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
    title = selector.xpath('//title/text()').extract()
    for index, id_ in enumerate(song_id_list):
        l = ItemLoader(item=PlayListItem())
        l.add_value('song_name', song_name_list[index])
        l.add_value('title', title)
        # id_ looks like "/song?id=<digits>"; id_[9:] keeps only the numeric id.
        yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET',
                                 headers=self.headers, callback=self.parse_single_song)
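
# A minimal PlayListItem sketch consistent with the loader calls above; this
# is an assumption, and the real item likely declares further fields that
# parse_single_song fills through the loader passed along in meta.
import scrapy

class PlayListItem(scrapy.Item):
    song_name = scrapy.Field()
    title = scrapy.Field()
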
def parse(self, response):

    for outer in response.css('#comapreTable tr:not(:first-child)'):

        if outer.css('td[align="center"]'):
            ccode = outer.css('td[align="center"]>a::attr(id)').extract_first()
            cname = outer.css('td[align="center"]>a::text').extract_first()

        for inner in outer.xpath('td[div[@align="left"]/a]'):
            loader = ItemLoader(item=EolZhuanyeItem(), selector=inner)
            loader.add_value('ccode', ccode)
            loader.add_value('cname', cname)
            loader.add_css('url', 'a::attr(href)', lambda urls: urljoin(self.start_urls[0], urls[0]))
            loader.add_xpath('code', 'following-sibling::td[1]/text()', MapCompose(unicode.strip))
            loader.add_css('name', 'a::text', MapCompose(unicode.strip))
            item = loader.load_item()

            yield Request(url=item['url'][0], meta={'item': item}, callback=self.parse_item)
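
# Note: MapCompose(unicode.strip) above relies on the Python 2 unicode type;
# under Python 3 the equivalent processor would be MapCompose(str.strip).
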
def parse_news(self, response):
    self.logger.info('parse_news: %s' % response)

    # Initialize the item loader and extract the news title, published_at,
    # author, content, and url.
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_selectors = response.css('h1[itemprop="headline"]::text')
    if not title_selectors:
        # Will be dropped in the item pipeline.
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title)

    author_name_selectors = response.css('a[rel="author"] > span::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_name_selectors.extract()[0]
        loader.add_value('author_name', author_name)

    raw_content_selectors = response.css('.content')
    if not raw_content_selectors:
        # Will be dropped in the item pipeline.
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)

    date_time_str_selectors = response.css('article > div.time::text')
    if not date_time_str_selectors:
        # Will be dropped in the item pipeline.
        return loader.load_item()

    # Parse the date information.
    # Example: "Selasa, 6 Oktober 2015 - 05:23 WIB" -> drop the day name and
    # the trailing " WIB", then pass each word through the _() helper
    # (presumably mapping Indonesian month names to English) so that
    # strptime's %B directive can parse it.
    date_time_str = date_time_str_selectors.extract()[0]
    date_time_str = date_time_str.split(',')[1].strip()[:-4]
    date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
    try:
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y - %H:%M')
    except ValueError:
        # Will be dropped in the item pipeline.
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    # Move the scraped news to the pipeline.
    return loader.load_item()
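
# A minimal sketch of the wib_to_utc helper the snippet above depends on,
# assuming naive datetimes are exchanged (as the strptime call suggests) and
# that WIB (Waktu Indonesia Barat) is the fixed, DST-free UTC+7 offset.
from datetime import timedelta

def wib_to_utc(wib_dt):
    # WIB has no daylight saving time, so a constant 7 hour shift is enough.
    return wib_dt - timedelta(hours=7)
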
def parse_item(self, response):
    """
    Extract fields from the individual email page and load them into the
    item.

    @url http://lkml.iu.edu/hypermail/linux/kernel/0111.3/0036.html
    @returns items 1 1
    @scrapes senderName senderEmail timeSent timeReceived subject body
    @scrapes replyto url
    """

    load = ItemLoader(item=Email(), selector=response)

    # Take care of the easy fields first.
    load.add_value('url', response.url)

    pattern_replyto = '//ul[1]/li[contains((b|strong), "In reply to:")]'
    pattern_replyto += '/a/@href'
    link = response.xpath(pattern_replyto).extract()
    link = [''] if not link else link

    load.add_value('replyto', link[0])

    # Sometime in 2003 the archive changed, and the email pages since then
    # require a specific procedure to extract the following fields:
    specific_fields = {
        'senderName': None,
        'senderEmail': None,
        'timeSent': None,
        'timeReceived': None,
        'subject': None
    }

    # Detect the new archive system by its MHonArc HTML comment.
    new_system = response.xpath('/comment()[1][contains(., "MHonArc")]')

    if len(new_system) >= 1:
        # New archive system detected.
        specific_fields = self.parse_new_system(response, specific_fields)
        body_before_comment = '<!--X-Body-of-Message-->'
        body_after_comment = '<!--X-Body-of-Message-End-->'
    else:
        # Old archive system.
        specific_fields = self.parse_old_system(response, specific_fields)
        body_before_comment = '<!-- body="start" -->'
        body_after_comment = '<!-- body="end" -->'

    # Load all the values from these specific fields.
    for key, val in specific_fields.items():
        load.add_value(key, val)

    if self.get_body:
        # Final field: the body of the email.
        pattern_body = body_before_comment + '\n?(.*)' + body_after_comment

        # Ignore invalid bytes when necessary.
        page_body = response.body.decode('utf-8', 'ignore')
        body = re.search(pattern_body, page_body, flags=re.S)
        if body:  # Guard against pages where the body markers are missing.
            load.add_value('body', body.group(1))

    return load.load_item()
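
# The docstring above declares Scrapy spider contracts (@url, @returns,
# @scrapes); they can be exercised against the live @url with:
#
#     scrapy check <spider_name>
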
def parse_question(self, response):
    # Process the question page and extract a question item from it.
    question_id = response.meta.get("zhihu_id", "")
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
    item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers, callback=self.parse_answer)
    yield question_item
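
# question_id arrives via response.meta here, so whatever request scheduled
# this callback must have been created with meta={"zhihu_id": ...}.
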
def parse_question(self, response):
    question_pattern = re.compile(r'(.*zhihu.com/question/(\d+))(/|$).*')
    match_object = re.match(question_pattern, response.url)
    question_id = match_object.group(2)
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_value('zhihu_id', question_id)
    item_loader.add_css('title', 'h1.QuestionHeader-title::text')
    item_loader.add_css('topics', '.TopicLink .Popover div::text')
    item_loader.add_value('url', response.url)
    item_loader.add_css('content', '.QuestionHeader-detail div div span::text')
    item_loader.add_css('answer_num', '.List-headerText span::text')
    item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
    item_loader.add_css('watch_user_num', '.NumberBoard-value::text')

    item = item_loader.load_item()
    yield item
    yield scrapy.Request(self.start_answer_url.format(question_id=question_id, offset=0, limit=20),
                         headers=self.headers, callback=self.parse_answer)
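
# Unlike the positional .format() calls in the earlier variants, this spider
# fills its answer-API template with named placeholders. A matching attribute
# sketch (the URL itself is an assumption, not from the original source):
#
#     start_answer_url = ("https://www.zhihu.com/api/v4/questions/{question_id}"
#                         "/answers?limit={limit}&offset={offset}")
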
def parse_item(self, response):

    loader = ItemLoader(ChsiDaxueItem(), response)
    loader.add_value('id', response.url, re=ur'schId-(\w+)\.dhtml')
    loader.add_value('url', response.url)
    loader.add_css('logo', u'.r_c_sch_logo>img::attr(src)', MapCompose(lambda url: urljoin('http://gaokao.chsi.com.cn/', url)))
    loader.add_css('name', u'.topImg::text')
    loader.add_css('badges', u'.r_c_sch_attr .r_c_sch_icon::attr(title)')

    data_clean = MapCompose(lambda x: re.sub(r'\s+', ' ', x), unicode.strip)
    loader.add_xpath('type', u'//span[@class="f_bold" and .="院校类型："]/following-sibling::text()', data_clean)
    loader.add_xpath('membership', u'//span[@class="f_bold" and .="院校隶属："]/following-sibling::text()', data_clean)
    loader.add_xpath('province', u'//span[@class="f_bold" and span]/following-sibling::text()', data_clean)
    loader.add_xpath('address', u'//span[@class="f_bold" and .="通讯地址："]/following-sibling::text()', data_clean)
    loader.add_xpath('phone', u'//span[@class="f_bold" and .="联系电话："]/following-sibling::text()', data_clean)
    loader.add_xpath('website', u'//span[@class="f_bold" and .="学校网址："]/following-sibling::a/@href', data_clean)
    loader.add_xpath('backdoor', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    def parse_votes():
        xpath = u'//td[@class="tdMydT" and .="{}"]/following-sibling::td/div[@class="rank"]/@rank'
        get_vote = lambda what: float(response.xpath(xpath.format(what)).extract_first() or 0)
        return {
            'overall': get_vote(u'综合满意度'),
            'environment': get_vote(u'校园环境满意度'),
            'life': get_vote(u'生活满意度'),
        }

    loader.add_value('votes', parse_votes())

    def parse_trending():
        css = u'{}>table tr:not(:first-child)'
        def get_trending(what):
            majors = []
            for e in response.css(css.format(what)):
                majors.append({
                    'id': e.css(u'.tdZytjTDiv>a::attr(href)').re_first(r'specId=(\w+)'),
                    'name': e.css(u'.tdZytjTDiv::attr(title)').extract_first(),
                    'vote': float(e.css(u'.avg_rank::text').extract_first()),
                    'count': int(e.css(u'.c_f00::text, .red::text').extract_first()),
                })
            return majors
        return {
            'count': get_trending(u'#topNoofPTable'),
            'index': get_trending(u'#topIndexTable'),
            'like': get_trending(u'.r_r_box_zymyd'),
        }

    loader.add_value('trending', parse_trending())

    item = loader.load_item()

    for link in LinkExtractor(restrict_xpaths=u'//ul[@id="topNav"]//a[.="学校简介"]').extract_links(response):
        yield Request(link.url, meta={'item': item}, callback=self.parse_jianjie)
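
# Note: the re= keyword used with add_value above is standard ItemLoader
# behaviour: the supplied regular expression is applied to the value and only
# the captured group is kept (here, the school id extracted from the URL).
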
def parse_first_page(self, response):
    count = int(response.xpath('//ul[@class="image"]/text()')[0].re(r'.*?(\d+).*?')[0])
    title = response.request.cookies['title']
    albumURL = response.url.replace(".shtml", '')
    # print count, title, albumURL
    for x in xrange(1, count + 1):
        suffix = ".shtml"
        if x > 1:
            suffix = "_" + str(x) + ".shtml"
        # print albumURL + suffix
        request = scrapy.Request(albumURL + suffix, callback=self.parse_item, cookies={'title': title})
        yield request

    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', title)
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    yield l.load_item()
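
# A plausible PageItem sketch for the loader above (an assumption, not the
# original definition). image_urls is the conventional input field for
# Scrapy's ImagesPipeline, which stores its download results in images.
import scrapy

class PageItem(scrapy.Item):
    title = scrapy.Field()
    name = scrapy.Field()
    url = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()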