解决方案
1. 使用 Scrapy 框架从网页中提取数据
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class SchemaItem(Item):
    """Container for the data scraped from one row of the schedule table."""

    # Text nodes taken from the row's second cell (the fixture date/time).
    date = Field()
    # Text nodes taken from the row's third cell (the two team names).
    teams = Field()
class SchemaSpider(BaseSpider):
    """Spider that turns every row of the schedule table into a SchemaItem.

    The spider downloads the single schedule page listed in ``start_urls``
    and yields one item per ``<tr>`` of the table with class ``tblContent``.
    """

    name = "schema"
    # Scrapy expects bare domain names here, not URLs: an entry that carries
    # a scheme or a trailing slash never matches a request's host name.
    allowed_domains = ["stats.swehockey.se"]
    start_urls = ["http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"]

    def parse(self, response):
        """Yield a SchemaItem for each row of the schedule table.

        ``item['date']`` and ``item['teams']`` are the lists returned by
        ``extract()`` for the second and third cell of the row; both may be
        empty for rows that lack those cells.
        """
        hxs = HtmlXPathSelector(response)
        for row in hxs.select('//table[@class="tblContent"]/tr'):
            item = SchemaItem()
            item['date'] = row.select('.//td[2]/div/span/text()').extract()
            item['teams'] = row.select('.//td[3]/text()').extract()
            yield item
2. 通过字符串拆分和主队名称比较过滤出特定的比赛
# Report the fixtures in `item` whose home team is one of the wanted clubs.
# `item` is the SchemaItem built for one table row; both of its fields are
# lists of strings.

# Home teams whose fixtures are printed.
wanted_home_teams = ("AIK", u"Djurgårdens IF")

for fixture in item['teams']:
    teams = fixture.split('-')  # "<home> - <away>"
    if len(teams) < 2:
        # Header rows and empty cells carry no "home - away" pair.
        continue
    home_team = teams[0].strip()
    away_team = teams[1].strip()
    if home_team not in wanted_home_teams:
        continue
    for fixDate in item['date']:
        # Fixed-position slices; assumes a "YYYY-MM-DD HH:MM" timestamp.
        year = fixDate[0:4]
        month = fixDate[5:7]
        day = fixDate[8:10]
        hour = fixDate[11:13]
        minute = fixDate[14:16]
        # A single parenthesised argument prints the same space-separated
        # line on Python 2 and Python 3.
        print(" ".join([year, month, day, hour, minute,
                        home_team, ":", away_team]))
3. 将比赛信息重新格式化为日历文件
BEGIN:VCALENDAR
VERSION:2.0
PRODID:-//hacksw/handcal//NONSGML v1.0//EN
END:VCALENDAR
4. 将日历文件保存为 .ics 文件
# Save the calendar as UTF-8. The text contains non-ASCII team names, so a
# plain text-mode write depends on the interpreter's default encoding (and
# fails with UnicodeEncodeError for unicode text on Python 2).
calendar_bytes = (
    calendar_text
    if isinstance(calendar_text, bytes)
    else calendar_text.encode('utf-8')
)
# Binary mode: the bytes are written exactly as encoded above.
with open('calendar.ics', 'wb') as f:
    f.write(calendar_bytes)
最终代码如下:
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class SchemaItem(Item):
    """Container for the data scraped from one row of the schedule table."""

    # Text nodes taken from the row's second cell (the fixture date/time).
    date = Field()
    # Text nodes taken from the row's third cell (the two team names).
    teams = Field()
class SchemaSpider(BaseSpider):
    """Scrape the schedule page and export selected home games to calendar.ics.

    Every row of the table with class ``tblContent`` is inspected. Fixtures
    whose home team is listed in ``_HOME_TEAMS`` are printed and collected,
    and once all rows have been processed a single iCalendar file named
    ``calendar.ics`` is written to the current working directory.
    """

    name = "schema"
    # Scrapy expects bare domain names here, not URLs: an entry that carries
    # a scheme or a trailing slash never matches a request's host name.
    allowed_domains = ["stats.swehockey.se"]
    start_urls = ["http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"]

    # Only fixtures whose home side is one of these teams are exported.
    _HOME_TEAMS = (u"AIK", u"Djurgårdens IF")
    # Location written into every exported event.
    _VENUE = u"HOVET"
    # Length of a "YYYY-MM-DD HH:MM" timestamp, the layout the slices in
    # _events_for_row rely on.
    _TIMESTAMP_LENGTH = 16

    _CALENDAR_HEADER = (
        u"BEGIN:VCALENDAR\n"
        u"VERSION:2.0\n"
        u"PRODID:-//hacksw/handcal//NONSGML v1.0//EN\n"
    )
    _CALENDAR_FOOTER = u"END:VCALENDAR\n"
    # NOTE(review): DTEND repeats DTSTART, so events have zero duration, and
    # the trailing "Z" marks the times as UTC although the page presumably
    # lists local time - confirm both before relying on the output.
    _EVENT_TEMPLATE = (
        u"BEGIN:VEVENT\n"
        u"SUMMARY:%s vs %s\n"
        u"DTSTART:%s%s%sT%s%s00Z\n"
        u"DTEND:%s%s%sT%s%s00Z\n"
        u"LOCATION:%s\n"
        u"DESCRIPTION:%s vs %s\n"
        u"END:VEVENT\n"
    )

    def parse(self, response):
        """Collect the wanted fixtures from all rows and write calendar.ics.

        Events are accumulated across every table row and the file is
        written exactly once, after the last row, so no row's events are
        lost or overwritten.
        """
        hxs = HtmlXPathSelector(response)
        events = []
        for row in hxs.select('//table[@class="tblContent"]/tr'):
            item = SchemaItem()
            item['date'] = row.select('.//td[2]/div/span/text()').extract()
            item['teams'] = row.select('.//td[3]/text()').extract()
            events.extend(self._events_for_row(item))

        calendar_text = (
            self._CALENDAR_HEADER + u"".join(events) + self._CALENDAR_FOOTER
        )
        # Encode explicitly: team names contain non-ASCII characters, and a
        # text-mode write would depend on the interpreter's default encoding.
        with open('calendar.ics', 'wb') as f:
            f.write(calendar_text.encode('utf-8'))

    def _events_for_row(self, item):
        """Return the VEVENT strings for the wanted fixtures of one row."""
        events = []
        for fixture in item['teams']:
            teams = fixture.split('-')  # "<home> - <away>"
            if len(teams) < 2:
                # Header rows and empty cells carry no "home - away" pair.
                continue
            home_team = teams[0].strip()
            away_team = teams[1].strip()
            if home_team not in self._HOME_TEAMS:
                continue
            for fix_date in item['date']:
                fix_date = fix_date.strip()
                if len(fix_date) < self._TIMESTAMP_LENGTH:
                    # Too short to hold a full timestamp; slicing it would
                    # put a malformed DTSTART into the calendar.
                    continue
                year = fix_date[0:4]
                month = fix_date[5:7]
                day = fix_date[8:10]
                hour = fix_date[11:13]
                minute = fix_date[14:16]
                # A single parenthesised argument prints the same
                # space-separated line on Python 2 and Python 3.
                print(u" ".join([year, month, day, hour, minute,
                                 home_team, u":", away_team]))
                events.append(self._EVENT_TEMPLATE % (
                    home_team, away_team,
                    year, month, day, hour, minute,
                    year, month, day, hour, minute,
                    self._VENUE,
                    home_team, away_team,
                ))
        return events