Scrapy Output Item As 1 List Element Per Row
New to scrapy and have looked everywhere over the past week or more for some solution to my problem. I am trying to scrape tabular data for ufc 1 at http://ufcstats.com/event-detai
Solution 1:
I have been working with Scrapy for many years, and I find the Item class unnecessary and quite confusing, especially for those who are new to Scrapy.
In your case, you need to iterate over the winner and loser elements in a for loop and yield them one by one:
class StatsSpider(scrapy.Spider):
    """Spider that emits one plain-dict item per fighter for a UFC event.

    ``parse`` reads the completed-events listing and follows a single event
    link; ``parse_event`` yields one dict per winner link and one per loser
    link found on that event page.
    """

    name = 'stats'
    allowed_domains = ['ufcstats.com']
    start_urls = ['http://ufcstats.com/statistics/events/completed?page=all']

    def parse(self, response):
        """Follow the first event link found in the reversed events table."""
        # Reverse the table rows so the last row on the page is looked at first.
        rev_ordered_events = response.css('tr.b-statistics__table-row')[::-1]
        event_link = rev_ordered_events.css('i>a::attr(href)').extract_first()
        if not event_link:
            # extract_first() returns None when nothing matches; Request
            # cannot be built from a missing URL.
            self.logger.warning('No event link found on %s', response.url)
            return
        yield scrapy.Request(url=event_link, callback=self.parse_event)

    def parse_event(self, response):
        """Yield one dict per fighter link, tagged 'winner' or 'loser'.

        Each dict has the keys ``name``, ``type``, ``event_name``,
        ``event_date`` and ``event_loc``. All 'winner' rows of a container
        are yielded before its 'loser' rows.
        """
        for match in response.css('div.l-page__container'):
            # Event-level fields are selected from the container itself; the
            # per-fighter loop variable does not exist yet at this point.
            event_name = match.css("h2.b-content__title>span::text").extract_first()
            # NOTE(review): unlike event_loc, this selector has no ::text, so
            # the value is presumably the <li> element's markup rather than
            # its text - confirm that is intended.
            event_date = match.css("ul.b-list__box-list>li:nth-child(1)").extract_first()
            event_loc = match.css("ul.b-list__box-list>li:nth-child(2)::text").extract_first()

            # Odd-positioned <p> entries are treated as winners and
            # even-positioned ones as losers. A single loop builds both kinds
            # of row so the 'type' tag always lands on the dict being yielded.
            for role, parity in (('winner', 'odd'), ('loser', 'even')):
                selector = f"p.b-fight-details__table-text:nth-child({parity})>a"
                for fighter in match.css(selector):
                    yield {
                        'name': fighter.css("::text").extract_first(),
                        'type': role,
                        'event_name': event_name,
                        'event_date': event_date,
                        'event_loc': event_loc,
                    }
Solution 2:
thanks @umair and @Catalina_Chircu
def parse_event(self, response):
    """Yield one loaded ``StatsItem`` per fight row of an event page.

    Event-level values (name, date, location, attendance) are added to every
    item; per-fight values are selected from the row's table cells, where
    odd-positioned ``<p>`` entries feed the ``w_*`` fields and even-positioned
    ones feed the ``l_*`` fields.
    """
    pg = response.css('div.l-page__container')
    # The event title is selected from the page container, not from the
    # fight-details block, so it is looked up once before the loop.
    event_name = pg.css('h2.b-content__title>span::text').extract_first()

    for event in response.css('div.b-fight-details'):
        event_date = event.css('ul.b-list__box-list>li:nth-child(1)::text').extract()
        event_loc = event.css('ul.b-list__box-list>li:nth-child(2)::text').extract()
        attendance = event.css('ul.b-list__box-list>li:nth-child(3)::text').extract()

        # Skip the first <tr>; presumably the table header - verify against the page.
        for fights in event.css('tr')[1:]:
            il = ItemLoader(StatsItem(), selector=fights)
            il.add_value('event_name', event_name)
            il.add_value('event_date', event_date)
            il.add_value('event_loc', event_loc)
            il.add_value('attendance', attendance)

            il.add_css('winner', 'td.b-fight-details__table-col:nth-child(2) p.b-fight-details__table-text:nth-child(odd)>a::text')
            il.add_css('loser', 'td.b-fight-details__table-col:nth-child(2) p.b-fight-details__table-text:nth-child(even)>a::text')
            #il.add_css('f_info', ':nth-child(3) p.b-fight-details__table-text::text')
            il.add_css('w_str', 'td.b-fight-details__table-col:nth-child(3)>p:nth-child(odd)::text')
            il.add_css('l_str', 'td.b-fight-details__table-col:nth-child(3)>p:nth-child(even)::text')
            il.add_css('w_td', 'td.b-fight-details__table-col:nth-child(4)>p:nth-child(odd)::text')
            il.add_css('l_td', 'td.b-fight-details__table-col:nth-child(4)>p:nth-child(even)::text')
            il.add_css('w_sub', 'td.b-fight-details__table-col:nth-child(5)>p:nth-child(odd)::text')
            il.add_css('l_sub', 'td.b-fight-details__table-col:nth-child(5)>p:nth-child(even)::text')
            il.add_css('w_pass', 'td.b-fight-details__table-col:nth-child(6)>p:nth-child(odd)::text')
            il.add_css('l_pass', 'td.b-fight-details__table-col:nth-child(6)>p:nth-child(even)::text')

            # From column 7 onward the w_* and l_* fields of each pair are
            # loaded from the same selector, so both receive the same value.
            il.add_css('w_wclass', 'td.b-fight-details__table-col:nth-child(7)>p:nth-child(1)::text')
            il.add_css('l_wclass', 'td.b-fight-details__table-col:nth-child(7)>p:nth-child(1)::text')
            il.add_css('w_method', 'td.b-fight-details__table-col:nth-child(8)>p:nth-child(odd)::text')
            il.add_css('l_method', 'td.b-fight-details__table-col:nth-child(8)>p:nth-child(odd)::text')
            il.add_css('w_mthdtl', 'td.b-fight-details__table-col:nth-child(8)>p:nth-child(even)::text')
            il.add_css('l_mthdtl', 'td.b-fight-details__table-col:nth-child(8)>p:nth-child(even)::text')
            il.add_css('w_round', 'td.b-fight-details__table-col:nth-child(9)>p:nth-child(odd)::text')
            il.add_css('l_round', 'td.b-fight-details__table-col:nth-child(9)>p:nth-child(odd)::text')
            il.add_css('w_time', 'td.b-fight-details__table-col:nth-child(10)>p:nth-child(odd)::text')
            il.add_css('l_time', 'td.b-fight-details__table-col:nth-child(10)>p:nth-child(odd)::text')

            yield il.load_item()
Using this together with the associated item input/output processors gives me most of what I was hoping for.
Post a Comment for "Scrapy Output Item As 1 List Element Per Row"