Skip to content Skip to sidebar Skip to footer

Scrapy Output Item As 1 List Element Per Row

New to scrapy and have looked everywhere over the past week or more for some solution to my problem. I am trying to scrape tabular data for ufc 1 at http://ufcstats.com/event-detai

Solution 1:

I have been working with Scrapy for many years, and I find the Item class useless and very confusing, especially for those who are new to Scrapy.

In your case, you need to iterate over the winner and loser elements in a for loop and yield them one by one:

class StatsSpider(scrapy.Spider):
    """Spider that yields one dict per fighter found on a ufcstats.com event page.

    ``parse`` picks a single event link from the completed-events listing and
    ``parse_event`` emits one plain dict per fighter link, tagged as
    ``'winner'`` or ``'loser'`` and carrying the event-level fields.
    """

    name = 'stats'
    allowed_domains = ['ufcstats.com']
    start_urls = ['http://ufcstats.com/statistics/events/completed?page=all']

    def parse(self, response):
        """Follow the first event link found when the table rows are read in reverse.

        Yields a single ``scrapy.Request`` whose callback is ``parse_event``.
        """
        # Reversing the rows means extract_first() returns the link from the
        # last linked row on the page (presumably the oldest event, if the
        # listing is newest-first -- confirm against the live page).
        rev_orderd_events = response.css('tr.b-statistics__table-row')[::-1]

        event_link = rev_orderd_events.css('i>a::attr(href)').extract_first()
        if event_link is None:
            # scrapy.Request rejects a None url, so report and stop instead.
            self.logger.warning('No event link found on %s', response.url)
            return
        yield scrapy.Request(url=event_link, callback=self.parse_event)

    # follow links
    def parse_event(self, response):
        """Yield one dict per fighter link on the event page.

        Each dict has the keys ``name``, ``type`` (``'winner'`` or
        ``'loser'``), ``event_name``, ``event_date`` and ``event_loc``.
        All winners of a container are yielded before its losers.
        """
        # Fighter links sit in alternating <p> elements: odd positions are
        # treated as winners, even positions as losers.
        roles = (
            ('winner', "p.b-fight-details__table-text:nth-child(odd)>a"),
            ('loser', "p.b-fight-details__table-text:nth-child(even)>a"),
        )

        for match in response.css('div.l-page__container'):
            # Event-level fields are read from the container itself; they are
            # shared by every fighter dict yielded below.
            event_name = match.css("h2.b-content__title>span::text").extract_first()
            # NOTE(review): this selector has no ::text, so the whole <li>
            # markup is returned -- kept as in the original; confirm intent.
            event_date = match.css("ul.b-list__box-list>li:nth-child(1)").extract_first()
            event_loc = match.css("ul.b-list__box-list>li:nth-child(2)::text").extract_first()

            for role, selector in roles:
                for item in match.css(selector):
                    # A fresh dict per fighter: an already-yielded item must
                    # never be mutated afterwards.
                    yield {
                        'name': item.css("::text").extract_first(),
                        'type': role,
                        'event_name': event_name,
                        'event_date': event_date,
                        'event_loc': event_loc,
                    }

Solution 2:

thanks @umair and @Catalina_Chircu

def parse_event(self, response):
    """Yield one loaded ``StatsItem`` per fight row of an event page.

    Event-level values (name, date, location, attendance) are copied onto
    every item; per-fight values are read from the row's table cells, where
    odd ``<p>`` children feed the ``w_*`` fields and even ones the ``l_*``
    fields. Fight-level columns (weight class, method, round, time) are
    stored under both the ``w_`` and ``l_`` field with the same selector.
    """
    pg = response.css('div.l-page__container')
    # The event title does not depend on the loop variable, so look it up once.
    event_name = pg.css('h2.b-content__title>span::text').extract_first()

    for event in response.css('div.b-fight-details'):
        event_date = event.css('ul.b-list__box-list>li:nth-child(1)::text').extract()
        event_loc = event.css('ul.b-list__box-list>li:nth-child(2)::text').extract()
        attendance = event.css('ul.b-list__box-list>li:nth-child(3)::text').extract()

        # The first <tr> is skipped (presumably the table header -- confirm).
        for fight_row in event.css('tr')[1:]:
            il = ItemLoader(StatsItem(), selector=fight_row)

            # Event-level values, identical for every fight of the event.
            il.add_value('event_name', event_name)
            il.add_value('event_date', event_date)
            il.add_value('event_loc', event_loc)
            il.add_value('attendance', attendance)

            # Fighter names: odd <p> is the winner, even <p> the loser.
            il.add_css('winner', 'td.b-fight-details__table-col:nth-child(2) p.b-fight-details__table-text:nth-child(odd)>a::text')
            il.add_css('loser', 'td.b-fight-details__table-col:nth-child(2) p.b-fight-details__table-text:nth-child(even)>a::text')
            # NOTE: 'f_info' is not populated; candidate selector was
            # ':nth-child(3) p.b-fight-details__table-text::text'.

            # Per-fighter statistics columns (3-6).
            il.add_css('w_str', 'td.b-fight-details__table-col:nth-child(3)>p:nth-child(odd)::text')
            il.add_css('l_str', 'td.b-fight-details__table-col:nth-child(3)>p:nth-child(even)::text')
            il.add_css('w_td', 'td.b-fight-details__table-col:nth-child(4)>p:nth-child(odd)::text')
            il.add_css('l_td', 'td.b-fight-details__table-col:nth-child(4)>p:nth-child(even)::text')
            il.add_css('w_sub', 'td.b-fight-details__table-col:nth-child(5)>p:nth-child(odd)::text')
            il.add_css('l_sub', 'td.b-fight-details__table-col:nth-child(5)>p:nth-child(even)::text')
            il.add_css('w_pass', 'td.b-fight-details__table-col:nth-child(6)>p:nth-child(odd)::text')
            il.add_css('l_pass', 'td.b-fight-details__table-col:nth-child(6)>p:nth-child(even)::text')

            # Fight-level columns (7-10): the same selector feeds both fields.
            il.add_css('w_wclass', 'td.b-fight-details__table-col:nth-child(7)>p:nth-child(1)::text')
            il.add_css('l_wclass', 'td.b-fight-details__table-col:nth-child(7)>p:nth-child(1)::text')
            il.add_css('w_method', 'td.b-fight-details__table-col:nth-child(8)>p:nth-child(odd)::text')
            il.add_css('l_method', 'td.b-fight-details__table-col:nth-child(8)>p:nth-child(odd)::text')
            il.add_css('w_mthdtl', 'td.b-fight-details__table-col:nth-child(8)>p:nth-child(even)::text')
            il.add_css('l_mthdtl', 'td.b-fight-details__table-col:nth-child(8)>p:nth-child(even)::text')
            il.add_css('w_round', 'td.b-fight-details__table-col:nth-child(9)>p:nth-child(odd)::text')
            il.add_css('l_round', 'td.b-fight-details__table-col:nth-child(9)>p:nth-child(odd)::text')
            il.add_css('w_time', 'td.b-fight-details__table-col:nth-child(10)>p:nth-child(odd)::text')
            il.add_css('l_time', 'td.b-fight-details__table-col:nth-child(10)>p:nth-child(odd)::text')

            yield il.load_item()

This, together with the associated item input/output processors, gives me most of what I was hoping for.


Post a Comment for "Scrapy Output Item As 1 List Element Per Row"