"""Source code for afl_tables: a scraper for match data published on the AFL Tables website."""

from urllib.parse import urljoin
import requests
import bs4
import datetime
import itertools
import typing
from bs4 import BeautifulSoup
from pytz import timezone

BASE_URL = 'https://afltables.com/afl/'
AEST = timezone('Australia/Melbourne')


def grouper(n, iterable, fillvalue=None):
    """
    Splits an iterable into consecutive tuples of length n

    When the iterable does not divide evenly, the last tuple is padded
    with ``fillvalue``.
    """
    # Every slot refers to the *same* iterator, so zip_longest pulls n
    # successive items from it for each tuple it produces
    shared = iter(iterable)
    slots = (shared,) * n
    return itertools.zip_longest(*slots, fillvalue=fillvalue)


class MatchException(Exception):
    """
    Raised when markup cannot be interpreted as a Match
    """


class Score:
    """
    Represents an AFL score for a single team at a given point in time

    :ivar goals: Number of goals scored
    :ivar behinds: Number of behinds/points scored
    """
    goals: int
    behinds: int

    def __init__(self, goals: int, behinds: int):
        self.goals = goals
        self.behinds = behinds

    @classmethod
    def parse(cls, pointstring: str) -> 'Score':
        """
        Parses a string in the form x.y, optionally wrapped in parentheses

        :param pointstring: Text such as ``'10.5'`` or ``'(10.5)'``
        :raises ValueError: If the text is not two integers separated by a
            single dot
        """
        goals, behinds = pointstring.replace('(', '').replace(')', '').split('.')
        # Construct through cls (not Score) so that a subclass calling
        # parse() receives an instance of its own type
        return cls(int(goals), int(behinds))

    @property
    def score(self) -> int:
        """
        The calculated score as a single integer (6 per goal, 1 per behind)
        """
        return 6 * self.goals + self.behinds

    def __str__(self):
        return f'{self.goals}.{self.behinds}'

    def __repr__(self):
        return f'{type(self).__name__}(goals={self.goals!r}, behinds={self.behinds!r})'
class TeamMatch:
    """
    Represents an individual team in an individual match

    :ivar name: The name of this team
    :ivar scores: A list of Score objects indicating the score of this team at
        the end of each of the four quarters. There may be 5 values in the
        array, in the case of extra time. In all cases, the final value in
        this array is the final score for this team
    :ivar match: The Match that this team's entry belongs to
    """
    name: str
    scores: typing.List['Score']
    match: 'Match'

    def __init__(self, name: str, match: 'Match',
                 scores: typing.Optional[typing.List['Score']] = None):
        """
        :param name: The name of this team
        :param match: The Match this team played in
        :param scores: The team's scores in order; omitted for a bye
        """
        self.name = name
        # A fresh list per instance: a shared ``[]`` default would be the same
        # object on every team created without scores
        self.scores = [] if scores is None else scores
        self.match = match

    @property
    def final_score(self) -> typing.Optional['Score']:
        """
        Returns the final score of this team at the end of the match, or None,
        if this was a bye
        """
        if self.match.bye:
            return None
        return self.scores[-1]

    @classmethod
    def parse_bye(cls, name: 'bs4.Tag', match: 'Match'):
        """
        Builds the team entry for a bye, which has a name but no scores

        :param name: The element whose text is the team name
        :param match: The Match this entry belongs to
        """
        return cls(name=name.text, match=match)

    @classmethod
    def parse_match(cls, name: 'bs4.Tag', rounds: 'bs4.Tag', match: 'Match'):
        """
        Builds the team entry for a played match

        :param name: The element whose text is the team name
        :param rounds: The element whose text is a whitespace-separated list
            of scores, each parsed with Score.parse
        :param match: The Match this entry belongs to
        """
        return cls(
            name=name.text,
            scores=[Score.parse(s) for s in rounds.text.split()],
            match=match
        )

    def __str__(self):
        if self.match.bye:
            return f'{self.name} Bye'
        return f'{self.name} {self.final_score}'
class Match:
    """
    Represents a single match of AFL

    :ivar teams: A list of teams, with either two teams or one team (a bye)
    :ivar attendees: Number of attendees at this match
    :ivar date: The time and date that this match started
    :ivar venue: The name of the venue at which this match was played
    :ivar winner: The name of the winning team
    """
    teams: typing.List['TeamMatch']
    attendees: int
    date: datetime.datetime
    venue: str
    winner: str

    def __init__(self, teams: typing.List['TeamMatch'], winner: str,
                 attendees: typing.Optional[int] = None,
                 date: typing.Optional[datetime.datetime] = None,
                 venue: typing.Optional[str] = None,
                 bye: bool = False):
        self.teams = teams
        self.attendees = attendees
        self.date = date
        self.venue = venue
        self.bye = bye
        self.winner = winner

    @staticmethod
    def _parse_misc(misc: 'bs4.Tag') -> dict:
        """
        Parse the date/venue/attendees section

        :param misc: The cell holding the date text followed by the optional
            venue and attendance items
        :return: A dict with a ``date`` key and, when present in the markup,
            ``venue`` and ``attendees`` keys
        """
        date = misc.contents[0]
        date_elements = str(date).replace('(', '').replace(')', '').split()
        # Weekday and date come first; the time is taken from the *last* two
        # tokens, so when a second (parenthesised) time is present it wins
        date_str = ' '.join(date_elements[0:2] + date_elements[-2:])
        naive_date = datetime.datetime.strptime(date_str, '%a %d-%b-%Y %I:%M %p')
        ret = {
            # pytz zones have to be attached with localize(); passing one as
            # tzinfo= to replace() does not pick the offset in force on the date
            'date': AEST.localize(naive_date)
        }

        # The misc section has variable items, so we have to parse it dynamically
        misc_attr = None
        for element in misc.contents[1:]:
            text = str(element)
            if 'Venue' in text:
                misc_attr = 'venue'
            elif 'Att' in text:
                misc_attr = 'attendees'
            elif text.strip():
                # First non-blank element after a label is that label's value
                if misc_attr == 'venue':
                    ret['venue'] = element.text
                elif misc_attr == 'attendees':
                    # Stored as a plain int (a trailing comma here would
                    # silently turn it into a 1-tuple)
                    ret['attendees'] = int(text.replace(',', '').replace(' ', ''))
                misc_attr = None
        return ret

    @classmethod
    def parse(cls, table: 'bs4.Tag'):
        """
        Parses a Match from the appropriate <table> element

        :param table: A table with 8 cells (a played match) or 2 cells (a bye)
        :raises MatchException: If the table has any other number of cells
        """
        td = table.find_all('td')
        if len(td) == 8:
            # The two per-team total-score cells are not needed: the final
            # score is the last entry of each team's stats cell
            team_1, team_1_stats, _, misc, team_2, team_2_stats, _, winner = td
            misc_kwargs = cls._parse_misc(misc)
            match = cls(
                [],
                bye=False,
                winner=winner.b.text,
                **misc_kwargs
            )
            # Teams hold a back-reference to the match, so they are attached
            # after the match object exists
            match.teams = [
                TeamMatch.parse_match(team_1, team_1_stats, match),
                TeamMatch.parse_match(team_2, team_2_stats, match)
            ]
            return match
        elif len(td) == 2:
            match = cls([], bye=True, winner=td[0].text)
            match.teams = [TeamMatch.parse_bye(td[0], match)]
            return match
        else:
            raise MatchException('This is invalid markup for a Match object')

    def __str__(self):
        if self.bye:
            return f'{self.teams[0].name} vs Bye'
        return f'{self.teams[0].name} vs {self.teams[1].name}'
class Round:
    """
    Represents a single round of AFL, with one or more matches being played in that round

    :ivar title: The human-readable title for this round
    :ivar matches: A list of matches played during this round
    """
    title: str
    matches: typing.List['Match']

    def __init__(self, title: str, matches: typing.Optional[list] = None):
        """
        :param title: The human-readable title for this round
        :param matches: The matches in this round; defaults to a new empty list
        """
        self.title = title
        # A fresh list per instance: a shared ``[]`` default would be the same
        # object on every Round created without matches
        self.matches = [] if matches is None else matches

    @classmethod
    def parse(cls, title: 'bs4.Tag', table: 'bs4.Tag') -> 'Round':
        """
        Parses a round from two table elements that define it

        :param title: The <table> tag that contains this round's header
        :param table: The <table> tag that contains this round's data
        :raises MatchException: If the title contains 'Final' and the table
            is not valid markup for a Match
        """
        title_text = title.text

        if 'Final' in title_text:
            # When the title contains 'Final' the table itself is the match
            matches = [Match.parse(table)]
        else:
            matches = []
            for match in table.select('td[width="85%"] table'):
                try:
                    matches.append(Match.parse(match))
                except MatchException:
                    # Nested tables that are not valid match markup are skipped
                    continue

        return cls(title=title_text, matches=matches)

    def __str__(self):
        return self.title
class MatchScraper:
    """
    A static class that can be used to scrape the matches from the AFL Tables website
    """

    #: Seconds to wait for the server before giving up on the request
    REQUEST_TIMEOUT = 30

    @staticmethod
    def _url(year: int):
        """
        Returns the AFL Tables URL for the provided year
        """
        return urljoin(BASE_URL, f'seas/{year}.html')

    @classmethod
    def scrape(cls, year: int) -> typing.List['Round']:
        """
        Scrapes all the match data for the given year

        :param year: The year to scrape, e.g. 2015
        :return: One Round per title/content pair of tables found on the page
        :raises requests.HTTPError: If the server answers with an error status
        :raises requests.Timeout: If the server does not answer in time
        """
        url = cls._url(year)

        # Without a timeout requests waits indefinitely on a stalled server
        response = requests.get(url, timeout=cls.REQUEST_TIMEOUT)
        # Fail loudly rather than parsing an error page as if it were a season
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html5lib')

        # Filter out irrelevant tables
        tables = [
            table for table in soup.select('center > table')
            if table.get('class') != ['sortable'] and table.text != 'Finals'
        ]

        # Group the tables into title, content pairs
        rounds = []
        for header, body in grouper(2, tables):
            if body is None:
                # An odd number of tables leaves a final header padded with
                # None; it has no content table to parse
                continue
            title = header.find('td')
            rounds.append(Round.parse(title, body))

        return rounds