如何从网页中提取信息并转化为日历文件

133 阅读2分钟

解决方案

1. 使用 Scrapy 框架从网页中提取数据

from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector


class SchemaItem(Item):
    """Container for the data scraped from one row of the schedule table."""
    # Strings extracted from the row's date cell; the spider stores a list here.
    date = Field()
    # Strings extracted from the row's teams cell; the spider stores a list here.
    teams = Field()


class SchemaSpider(BaseSpider):
    """Spider that scrapes the fixture list from the swehockey schedule page.

    Yields one ``SchemaItem`` per ``<tr>`` of the ``tblContent`` table, with
    the raw text of the date cell and the teams cell.
    """

    name = "schema"
    # allowed_domains takes bare domain names, not URLs: a URL entry never
    # matches a request's host, so offsite filtering would not work as intended.
    allowed_domains = ["stats.swehockey.se"]
    start_urls = ["http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"]

    def parse(self, response):
        """Extract the date and teams text from every row of the schedule.

        Args:
            response: the downloaded schedule page.

        Yields:
            SchemaItem: ``date`` and ``teams`` are lists of extracted strings
            (empty lists for rows that lack the expected cells).
        """
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//table[@class="tblContent"]/tr')

        for row in rows:
            item = SchemaItem()
            # Second column holds the date/time, third column the two teams.
            item['date'] = row.select('.//td[2]/div/span/text()').extract()
            item['teams'] = row.select('.//td[3]/text()').extract()

            yield item

2. 使用字符串拆分和比较过滤出特定的比赛

# Only fixtures where one of these teams plays at home are listed.
wanted_home_teams = ("AIK", u"Djurgårdens IF")

for fixture in item['teams']:
    # Split "home - away" at the first hyphen only; partition() never raises,
    # so a cell without a hyphen is skipped instead of causing an IndexError.
    home_team, separator, away_team = fixture.partition('-')
    if not separator:
        continue
    home_team = home_team.strip()
    away_team = away_team.strip()

    # The team check must run for every fixture, i.e. inside the loop;
    # outside the loop only the last fixture would ever be examined.
    if home_team not in wanted_home_teams:
        continue

    for fix_date in item['date']:
        # Fixed-position slices of a "YYYY-MM-DD HH:MM"-shaped string.
        year = fix_date[0:4]
        month = fix_date[5:7]
        day = fix_date[8:10]
        hour = fix_date[11:13]
        minute = fix_date[14:16]
        # A single pre-formatted string prints the same on Python 2 and 3.
        print("%s %s %s %s %s %s : %s"
              % (year, month, day, hour, minute, home_team, away_team))

3. 将比赛信息重新格式化为日历文件

BEGIN:VCALENDAR
VERSION:2.0
PRODID:-//hacksw/handcal//NONSGML v1.0//EN
END:VCALENDAR

4. 将日历文件保存为 .ics 文件

import io

# iCalendar files are UTF-8 text. Writing with an explicit encoding keeps
# non-ASCII team names (e.g. "Djurgårdens IF") intact instead of depending on
# the platform's default encoding. io.open behaves the same on Python 2 and 3;
# calendar_text must be a text (unicode) string.
with io.open('calendar.ics', 'w', encoding='utf-8') as ics_file:
    ics_file.write(calendar_text)

最终代码如下:

from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector


class SchemaItem(Item):
    """Container for the data scraped from one row of the schedule table."""
    # Strings extracted from the row's date cell; the spider stores a list here.
    date = Field()
    # Strings extracted from the row's teams cell; the spider stores a list here.
    teams = Field()


# Only fixtures where one of these teams plays at home go into the calendar.
HOME_TEAMS = ("AIK", u"Djurgårdens IF")
# Location written into every exported event.
VENUE = "HOVET"
# File the calendar is written to.
CALENDAR_PATH = 'calendar.ics'

CALENDAR_HEADER = u"""BEGIN:VCALENDAR
VERSION:2.0
PRODID:-//hacksw/handcal//NONSGML v1.0//EN
"""

CALENDAR_FOOTER = u"""END:VCALENDAR
"""

# NOTE(review): the trailing "Z" marks the time as UTC and DTEND repeats
# DTSTART (a zero-length event). Confirm whether the scraped times are local
# times and whether a real end time or duration is wanted.
EVENT_TEMPLATE = u"""BEGIN:VEVENT
SUMMARY:%(home)s vs %(away)s
DTSTART:%(start)s
DTEND:%(start)s
LOCATION:%(venue)s
DESCRIPTION:%(home)s vs %(away)s
END:VEVENT
"""


def split_fixture(fixture):
    """Split "home - away" text into a ``(home, away)`` tuple.

    The text is split at the first hyphen only and both names are stripped.
    Returns ``None`` when the text contains no hyphen.
    """
    home, separator, away = fixture.partition('-')
    if not separator:
        return None
    return home.strip(), away.strip()


def split_date(fix_date):
    """Cut a "YYYY-MM-DD HH:MM"-shaped string into its five numeric parts.

    Returns ``(year, month, day, hour, minute)`` as strings, or ``None`` when
    the text is too short or any of the parts is not numeric.
    """
    parts = (fix_date[0:4], fix_date[5:7], fix_date[8:10],
             fix_date[11:13], fix_date[14:16])
    if len(fix_date) < 16 or not all(part.isdigit() for part in parts):
        return None
    return parts


def home_fixtures(teams, dates):
    """Yield ``(home, away, date_parts)`` for every wanted home fixture.

    Args:
        teams: strings of the form "home - away".
        dates: date strings accepted by ``split_date``.

    Each fixture whose home team is in ``HOME_TEAMS`` is paired with every
    usable date; malformed fixtures and dates are skipped.
    """
    for fixture in teams:
        pair = split_fixture(fixture)
        if pair is None or pair[0] not in HOME_TEAMS:
            continue
        for fix_date in dates:
            date_parts = split_date(fix_date)
            if date_parts is not None:
                yield pair[0], pair[1], date_parts


def format_event(home, away, date_parts):
    """Return the VEVENT text for one fixture."""
    return EVENT_TEMPLATE % {
        'home': home,
        'away': away,
        'start': u"%s%s%sT%s%s00Z" % date_parts,
        'venue': VENUE,
    }


def write_calendar(events, path=CALENDAR_PATH):
    """Write the VEVENT blocks in ``events`` to ``path`` as one VCALENDAR.

    The file is written as UTF-8 so that non-ASCII team names survive
    regardless of the platform's default encoding.
    """
    import io  # io.open accepts ``encoding`` on both Python 2 and 3

    calendar_text = CALENDAR_HEADER + u"".join(events) + CALENDAR_FOOTER
    with io.open(path, 'w', encoding='utf-8') as ics_file:
        ics_file.write(calendar_text)


class SchemaSpider(BaseSpider):
    """Scrape the swehockey schedule page and export home games to an .ics file.

    Every table row is yielded as a ``SchemaItem``; home fixtures of the teams
    in ``HOME_TEAMS`` are additionally printed and collected into
    ``calendar.ics``.
    """

    name = "schema"
    # allowed_domains takes bare domain names, not URLs: a URL entry never
    # matches a request's host, so offsite filtering would not work as intended.
    allowed_domains = ["stats.swehockey.se"]
    start_urls = ["http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"]

    def parse(self, response):
        """Extract all fixtures, then write the calendar for the home games.

        Args:
            response: the downloaded schedule page.

        Yields:
            SchemaItem: one per table row, holding the raw date and teams text.
        """
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//table[@class="tblContent"]/tr')

        events = []
        for row in rows:
            item = SchemaItem()
            # Second column holds the date/time, third column the two teams.
            item['date'] = row.select('.//td[2]/div/span/text()').extract()
            item['teams'] = row.select('.//td[3]/text()').extract()

            for home, away, date_parts in home_fixtures(item['teams'],
                                                        item['date']):
                # A single pre-formatted string prints the same on Python 2 and 3.
                print("%s %s %s %s %s %s : %s" % (date_parts + (home, away)))
                events.append(format_event(home, away, date_parts))

            yield item

        # The calendar is built here because the scraped rows only exist
        # inside this callback; module-level code cannot see them.
        write_calendar(events)