Friday, March 3, 2017

soccerway.com App in Python

This post shares a spider that finds all match details (teams, date/time, score) from soccerway.com.

1. # This spider extracts the info (teams, date-time, score) related to soccer matches from soccerway.com,
2.
3. """ soccerway.com spider to scrape soccer matches info """
4.
5. from datetime import datetime
6.
7. from scrapy.http import Request
8. from scrapy.selector import HtmlXPathSelector
9. from scrapy.contrib.spiders import CrawlSpider
10. from scrapy.item import Item, Field
11. from scrapy.contrib.loader import XPathItemLoader
12. from scrapy.contrib.loader.processor import TakeFirst, MapCompose
13.
14.
class Match(Item):
    """Container for the scraped attributes of one soccer match."""
    # Competition identifier (the cup name configured on the spider).
    cup = Field()
    # Home and away team names.
    team1 = Field()
    team2 = Field()
    # Goals scored by each side, as strings (only set for played matches).
    goals1 = Field()
    goals2 = Field()
    # Match date (YYYY-MM-DD) and kick-off time (HH:MM).
    date = Field()
    time = Field()
24.
25.
class MatchLoader(XPathItemLoader):
    """Item loader that keeps the first extracted value per field and
    strips surrounding whitespace from team names on input."""
    default_item_class = Match
    default_output_processor = TakeFirst()
    # Team names come from anchor text and may carry stray whitespace.
    team1_in = MapCompose(lambda value: value.strip())
    team2_in = MapCompose(lambda value: value.strip())
32.
33.
class SoccerwaySpider(CrawlSpider):
    """Spider that scrapes match info (teams, date-time, score) from
    soccerway.com.

    Configure ``cups`` with ``(name, relative_url)`` pairs to choose
    which competitions are crawled.
    """
    name = 'soccerway'
    allowed_domains = ['soccerway.com']
    cups = [
        ('brazil-2010',
         'national/brazil/serie-a/2010/regular-season/matches/'),
    ]

    def start_requests(self):
        # One request per configured cup; the cup name rides along in meta.
        for cup, url in self.cups:
            yield Request('http://www.soccerway.com/%s' % url,
                          self.parse_matches, meta={'cup': cup})

    def parse_matches(self, response):
        """Parse every match row in a fixture-listing page."""
        cup = response.request.meta['cup']

        hxs = HtmlXPathSelector(response)
        dt = None
        dat = None  # persists across rows: the date cell appears once per day

        rows = hxs.select('//table[starts-with(@class,"matches")]'
                          '/tbody/tr[not(contains(@class,"aggr"))]')
        for row in rows:
            loader = MatchLoader(selector=row, response=response)
            loader.add_value('cup', cup)
            loader.add_xpath('team1', 'td[3]/a/text()')
            loader.add_xpath('team2', 'td[5]/a/text()')

            # Status/score cell: keep only the non-empty text fragments.
            fragments = [t.strip() for t in row.select('td[4]//text()').extract()
                         if t.strip()]
            status = fragments[1] if len(fragments) > 1 else fragments[0]

            # The date cell is present only on the first row of each day;
            # later rows reuse the last date seen.
            day = row.select('td[1]/span/text()').extract()
            if day:
                clock = row.select('td[2]/span/text()').extract()[0]
                dat = datetime.strptime('%s %s' % (day[0], clock),
                                        '%a %d/%m/%y')
            if dat:
                loader.add_value('date', dat.strftime('%Y-%m-%d'))

            # Unless the match is postponed, prefer the exact epoch
            # timestamp embedded in the first cell's data-value attribute.
            if status not in ('PSTP', '-'):
                stamp = row.select('td[1]/span/@data-value').extract()[0]
                dt = datetime.fromtimestamp(float(stamp))
            else:
                dt = None
            if dt:
                loader.replace_value('date', dt.strftime('%Y-%m-%d'))
                loader.add_value('time', dt.strftime('%H:%M'))

            # A "g1 - g2" status means the match was played: record the score.
            if '-' in status:
                parts = [p.strip() for p in status.split('-')]
                goals = [int(p) for p in parts if p]
                if len(goals) == 2:
                    loader.add_value('goals1', str(goals[0]))
                    loader.add_value('goals2', str(goals[1]))

            yield loader.load_item()
93.
# Legacy Scrapy convention: expose a module-level spider instance so the
# framework can discover it.
SPIDER = SoccerwaySpider()
95.
96. # Snippet imported from snippets.scrapy.org (which no longer works)
97. # author: vivek kumar
98. # date  : Aug 10, 2017

No comments:

Post a Comment