def parse_question(self, response):
    # Process the question page and extract a question item from it.
    if "QuestionHeader-title" in response.text:
        # New-style question page.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()
    else:
        # Old-style page: extract the item with the legacy selectors.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)

        if match_obj:
            question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # item_loader.add_css("title", ".zh-question-title h2 a::text")
        item_loader.add_xpath("title",
                              "//*[@id='zh-question-title']/h2/a/text()"
                              "|//*[@id='zh-question-title']/h2/span/text()")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
        item_loader.add_xpath("watch_user_num",
                              "//*[@id='zh-question-side-header-wrap']/text()"
                              "|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

        question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
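
# The snippet above references self.start_answer_url and self.headers without
# defining them. A minimal sketch of those spider attributes, assuming the
# Zhihu v4 answers API; the exact URL template and header values are
# assumptions, not taken from the original source:
import scrapy

class ZhihuQuestionSpiderSketch(scrapy.Spider):
    name = "zhihu_question_sketch"  # hypothetical spider name
    # Positional slots match self.start_answer_url.format(question_id, 20, 0):
    # {0} = question id, {1} = limit, {2} = offset.
    start_answer_url = ("https://www.zhihu.com/api/v4/questions/{0}/answers"
                        "?limit={1}&offset={2}")
    headers = {"User-Agent": "Mozilla/5.0", "Host": "www.zhihu.com"}
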
def parse(self, response):
    l = ItemLoader(item=PlantItem(), response=response)

    l.add_xpath('name', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/h2/text()")
    l.add_xpath('species', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/div[@class='clear resultSpecies']/text()")
    l.add_xpath('key', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-key']/text()")
    l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/child::node()")
    # l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/a/text()")

    return l.load_item()
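
# A hypothetical PlantItem definition consistent with the loader calls above;
# the real project may declare extra fields or attach input/output processors.
import scrapy

class PlantItem(scrapy.Item):
    name = scrapy.Field()
    species = scrapy.Field()
    key = scrapy.Field()
    value = scrapy.Field()
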
def parse_song_list(self, response):
    selector = Selector(response)

    song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
    song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
    title = selector.xpath('//title/text()').extract()
    for index, id_ in enumerate(song_id_list):
        l = ItemLoader(item=PlayListItem())
        l.add_value('song_name', song_name_list[index])
        l.add_value('title', title)
        # id_ looks like "/song?id=<digits>"; id_[9:] keeps only the numeric id.
        yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET',
                                 headers=self.headers, callback=self.parse_single_song)
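
# A minimal PlayListItem sketch consistent with the loader calls above; this
# is an assumption, and the real item likely declares further fields that
# parse_single_song fills through the loader passed along in meta.
import scrapy

class PlayListItem(scrapy.Item):
    song_name = scrapy.Field()
    title = scrapy.Field()
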
def parse(self, response):

    for outer in response.css('#comapreTable tr:not(:first-child)'):

        if outer.css('td[align="center"]'):
            ccode = outer.css('td[align="center"]>a::attr(id)').extract_first()
            cname = outer.css('td[align="center"]>a::text').extract_first()

        for inner in outer.xpath('td[div[@align="left"]/a]'):
            loader = ItemLoader(item=EolZhuanyeItem(), selector=inner)
            loader.add_value('ccode', ccode)
            loader.add_value('cname', cname)
            loader.add_css('url', 'a::attr(href)', lambda urls: urljoin(self.start_urls[0], urls[0]))
            loader.add_xpath('code', 'following-sibling::td[1]/text()', MapCompose(unicode.strip))
            loader.add_css('name', 'a::text', MapCompose(unicode.strip))
            item = loader.load_item()

            yield Request(url=item['url'][0], meta={'item': item}, callback=self.parse_item)
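
# Note: MapCompose(unicode.strip) above relies on the Python 2 unicode type;
# under Python 3 the equivalent processor would be MapCompose(str.strip).
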
def parse_news(self, response):
    self.logger.info('parse_news: %s' % response)

    # Initialize the item loader and extract the news title, published_at,
    # author, content, and url.
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    title_selectors = response.css('h1[itemprop="headline"]::text')
    if not title_selectors:
        # Will be dropped in the item pipeline.
        return loader.load_item()
    title = title_selectors.extract()[0]
    loader.add_value('title', title)

    author_name_selectors = response.css('a[rel="author"] > span::text')
    if not author_name_selectors:
        loader.add_value('author_name', '')
    else:
        author_name = author_name_selectors.extract()[0]
        loader.add_value('author_name', author_name)

    raw_content_selectors = response.css('.content')
    if not raw_content_selectors:
        # Will be dropped in the item pipeline.
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)

    date_time_str_selectors = response.css('article > div.time::text')
    if not date_time_str_selectors:
        # Will be dropped in the item pipeline.
        return loader.load_item()

    # Parse the date information.
    # Example: "Selasa, 6 Oktober 2015 - 05:23 WIB" -> drop the day name and
    # the trailing " WIB", then pass each word through the _() helper
    # (presumably mapping Indonesian month names to English) so that
    # strptime's %B directive can parse it.
    date_time_str = date_time_str_selectors.extract()[0]
    date_time_str = date_time_str.split(',')[1].strip()[:-4]
    date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
    try:
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y - %H:%M')
    except ValueError:
        # Will be dropped in the item pipeline.
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    # Move the scraped news to the pipeline.
    return loader.load_item()
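
# A minimal sketch of the wib_to_utc helper the snippet above depends on,
# assuming naive datetimes are exchanged (as the strptime call suggests) and
# that WIB (Waktu Indonesia Barat) is the fixed, DST-free UTC+7 offset.
from datetime import timedelta

def wib_to_utc(wib_dt):
    # WIB has no daylight saving time, so a constant 7 hour shift is enough.
    return wib_dt - timedelta(hours=7)
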
def parse_item(self, response):
    """
    Extract fields from the individual email page and load them into the
    item.

    @url http://lkml.iu.edu/hypermail/linux/kernel/0111.3/0036.html
    @returns items 1 1
    @scrapes senderName senderEmail timeSent timeReceived subject body
    @scrapes replyto url
    """

    load = ItemLoader(item=Email(), selector=response)

    # Take care of the easy fields first.
    load.add_value('url', response.url)

    pattern_replyto = '//ul[1]/li[contains((b|strong), "In reply to:")]'
    pattern_replyto += '/a/@href'
    link = response.xpath(pattern_replyto).extract()
    link = [''] if not link else link

    load.add_value('replyto', link[0])

    # Sometime in 2003 the archive changed, and the email pages since then
    # require a specific procedure to extract the following fields:
    specific_fields = {
        'senderName': None,
        'senderEmail': None,
        'timeSent': None,
        'timeReceived': None,
        'subject': None
    }

    # Detect the new archive system by its MHonArc HTML comment.
    new_system = response.xpath('/comment()[1][contains(., "MHonArc")]')

    if len(new_system) >= 1:
        # New archive system detected.
        specific_fields = self.parse_new_system(response, specific_fields)
        body_before_comment = '<!--X-Body-of-Message-->'
        body_after_comment = '<!--X-Body-of-Message-End-->'
    else:
        # Old archive system.
        specific_fields = self.parse_old_system(response, specific_fields)
        body_before_comment = '<!-- body="start" -->'
        body_after_comment = '<!-- body="end" -->'

    # Load all the values from these specific fields.
    for key, val in specific_fields.items():
        load.add_value(key, val)

    if self.get_body:
        # Final field: the body of the email.
        pattern_body = body_before_comment + '\n?(.*)' + body_after_comment

        # Ignore invalid bytes when necessary.
        page_body = response.body.decode('utf-8', 'ignore')
        body = re.search(pattern_body, page_body, flags=re.S)
        if body:  # Guard against pages where the body markers are missing.
            load.add_value('body', body.group(1))

    return load.load_item()
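
# The docstring above declares Scrapy spider contracts (@url, @returns,
# @scrapes); they can be exercised against the live @url with:
#
#     scrapy check <spider_name>
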
def parse_question(self, response):
    # Process the question page and extract a question item from it.
    question_id = response.meta.get("zhihu_id", "")
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
    item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers, callback=self.parse_answer)
    yield question_item
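
# question_id arrives via response.meta here, so whatever request scheduled
# this callback must have been created with meta={"zhihu_id": ...}.
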
def parse_question(self, response):
    question_pattern = re.compile(r'(.*zhihu.com/question/(\d+))(/|$).*')
    match_object = re.match(question_pattern, response.url)
    question_id = match_object.group(2)
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_value('zhihu_id', question_id)
    item_loader.add_css('title', 'h1.QuestionHeader-title::text')
    item_loader.add_css('topics', '.TopicLink .Popover div::text')
    item_loader.add_value('url', response.url)
    item_loader.add_css('content', '.QuestionHeader-detail div div span::text')
    item_loader.add_css('answer_num', '.List-headerText span::text')
    item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
    item_loader.add_css('watch_user_num', '.NumberBoard-value::text')

    item = item_loader.load_item()
    yield item
    yield scrapy.Request(self.start_answer_url.format(question_id=question_id, offset=0, limit=20),
                         headers=self.headers, callback=self.parse_answer)
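
# Unlike the positional .format() calls in the earlier variants, this spider
# fills its answer-API template with named placeholders. A matching attribute
# sketch (the URL itself is an assumption, not from the original source):
#
#     start_answer_url = ("https://www.zhihu.com/api/v4/questions/{question_id}"
#                         "/answers?limit={limit}&offset={offset}")
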
def parse_item(self, response):

    loader = ItemLoader(ChsiDaxueItem(), response)
    loader.add_value('id', response.url, re=ur'schId-(\w+)\.dhtml')
    loader.add_value('url', response.url)
    loader.add_css('logo', u'.r_c_sch_logo>img::attr(src)', MapCompose(lambda url: urljoin('http://gaokao.chsi.com.cn/', url)))
    loader.add_css('name', u'.topImg::text')
    loader.add_css('badges', u'.r_c_sch_attr .r_c_sch_icon::attr(title)')

    data_clean = MapCompose(lambda x: re.sub(r'\s+', ' ', x), unicode.strip)
    loader.add_xpath('type', u'//span[@class="f_bold" and .="院校类型："]/following-sibling::text()', data_clean)
    loader.add_xpath('membership', u'//span[@class="f_bold" and .="院校隶属："]/following-sibling::text()', data_clean)
    loader.add_xpath('province', u'//span[@class="f_bold" and span]/following-sibling::text()', data_clean)
    loader.add_xpath('address', u'//span[@class="f_bold" and .="通讯地址："]/following-sibling::text()', data_clean)
    loader.add_xpath('phone', u'//span[@class="f_bold" and .="联系电话："]/following-sibling::text()', data_clean)
    loader.add_xpath('website', u'//span[@class="f_bold" and .="学校网址："]/following-sibling::a/@href', data_clean)
    loader.add_xpath('backdoor', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
    def parse_votes():
        xpath = u'//td[@class="tdMydT" and .="{}"]/following-sibling::td/div[@class="rank"]/@rank'
        get_vote = lambda what: float(response.xpath(xpath.format(what)).extract_first() or 0)
        return {
            'overall': get_vote(u'综合满意度'),
            'environment': get_vote(u'校园环境满意度'),
            'life': get_vote(u'生活满意度'),
        }

    loader.add_value('votes', parse_votes())

    def parse_trending():
        css = u'{}>table tr:not(:first-child)'
        def get_trending(what):
            majors = []
            for e in response.css(css.format(what)):
                majors.append({
                    'id': e.css(u'.tdZytjTDiv>a::attr(href)').re_first(r'specId=(\w+)'),
                    'name': e.css(u'.tdZytjTDiv::attr(title)').extract_first(),
                    'vote': float(e.css(u'.avg_rank::text').extract_first()),
                    'count': int(e.css(u'.c_f00::text, .red::text').extract_first()),
                })
            return majors
        return {
            'count': get_trending(u'#topNoofPTable'),
            'index': get_trending(u'#topIndexTable'),
            'like': get_trending(u'.r_r_box_zymyd'),
        }

    loader.add_value('trending', parse_trending())

    item = loader.load_item()

    for link in LinkExtractor(restrict_xpaths=u'//ul[@id="topNav"]//a[.="学校简介"]').extract_links(response):
        yield Request(link.url, meta={'item': item}, callback=self.parse_jianjie)
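
# Note: the re= keyword used with add_value above is standard ItemLoader
# behaviour: the supplied regular expression is applied to the value and only
# the captured group is kept (here, the school id extracted from the URL).
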
def parse_first_page(self, response):
    count = int(response.xpath('//ul[@class="image"]/text()')[0].re(r'.*?(\d+).*?')[0])
    title = response.request.cookies['title']
    albumURL = response.url.replace(".shtml", '')
    # print count, title, albumURL
    for x in xrange(1, count + 1):
        suffix = ".shtml"
        if x > 1:
            suffix = "_" + str(x) + ".shtml"
        # print albumURL + suffix
        request = scrapy.Request(albumURL + suffix, callback=self.parse_item, cookies={'title': title})
        yield request

    l = ItemLoader(item=PageItem(), response=response)
    l.add_value('title', title)
    l.add_value('name', self.name)
    l.add_value('url', response.url)
    l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    yield l.load_item()
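
# A plausible PageItem sketch for the loader above (an assumption, not the
# original definition). image_urls is the conventional input field for
# Scrapy's ImagesPipeline, which stores its download results in images.
import scrapy

class PageItem(scrapy.Item):
    title = scrapy.Field()
    name = scrapy.Field()
    url = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()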