from urllib.parse import urljoin
import requests
import bs4
import datetime
import itertools
import typing
from bs4 import BeautifulSoup
from pytz import timezone
# Root of the AFL Tables site; page paths are joined onto this with urljoin
BASE_URL = 'https://afltables.com/afl/'
# pytz timezone object for Melbourne, attached to parsed match start times
AEST = timezone('Australia/Melbourne')
def grouper(n: int, iterable: typing.Iterable, fillvalue=None) -> typing.Iterator[tuple]:
    """
    Chunks an iterable into chunks of size n

    :param n: The number of items in each chunk
    :param iterable: The items to split up
    :param fillvalue: Value used to pad the final chunk when the number of items is not a multiple of n
    :return: An iterator of n-tuples, in the original order
    """
    # The list holds n references to the SAME iterator, so zip_longest pulls n consecutive items into each tuple
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)
class MatchException(Exception):
    """
    Raised when a <table> element does not contain the markup expected of a match
    """
class Score:
    """
    Represents an AFL score for a single team at a given point in time

    :ivar goals: Number of goals scored
    :ivar behinds: Number of behinds/points scored
    """
    goals: int
    behinds: int

    def __init__(self, goals: int, behinds: int):
        self.goals = goals
        self.behinds = behinds

    @classmethod
    def parse(cls, pointstring: str) -> 'Score':
        """
        Parses a string in the form x.y, where x is the goals and y is the behinds

        Any parentheses in the string are ignored, so ``(10.5)`` parses the same as ``10.5``.

        :param pointstring: The text to parse
        :raises ValueError: If the text is not two integers separated by a single dot
        """
        goals, behinds = pointstring.replace('(', '').replace(')', '').split('.')
        # Build via cls rather than Score so that subclasses get instances of their own type
        return cls(int(goals), int(behinds))

    @property
    def score(self) -> int:
        """
        The calculated score as a single integer (6 per goal plus 1 per behind)
        """
        return 6 * self.goals + self.behinds

    def __str__(self):
        return f'{self.goals}.{self.behinds}'

    def __repr__(self):
        return f'{type(self).__name__}(goals={self.goals!r}, behinds={self.behinds!r})'
class TeamMatch:
    """
    Represents an individual team in an individual match

    :ivar name: The name of this team
    :ivar scores: A list of Score objects indicating the score of this team at the end of each of the four quarters.
        There may be 5 values in the array, in the case of extra time. In all cases, the final value in this array is
        the final score for this team
    :ivar match: The Match that this team entry belongs to
    """
    name: str
    scores: typing.List['Score']
    match: 'Match'

    def __init__(self, name: str, match: 'Match', scores: typing.Optional[typing.List['Score']] = None):
        """
        :param name: The name of this team
        :param match: The Match that this team is playing in
        :param scores: The running scores for this team; defaults to a new empty list
        """
        self.name = name
        # A fresh list per instance: a literal [] default would be shared by every TeamMatch built without scores
        self.scores = [] if scores is None else scores
        self.match = match

    @property
    def final_score(self) -> typing.Optional['Score']:
        """
        Returns the final score of this team at the end of the match, or None, if this was a bye or no scores
        have been recorded
        """
        if self.match.bye or not self.scores:
            return None
        return self.scores[-1]

    @classmethod
    def parse_bye(cls, name: 'bs4.Tag', match: 'Match'):
        """
        Builds the entry for a team that has a bye, so has no scores

        :param name: The element whose text is the team name
        :param match: The Match that this team belongs to
        """
        return cls(name=name.text, match=match)

    @classmethod
    def parse_match(cls, name: 'bs4.Tag', rounds: 'bs4.Tag', match: 'Match'):
        """
        Builds the entry for a team that played a match

        :param name: The element whose text is the team name
        :param rounds: The element whose text is the whitespace-separated running scores, each in the form x.y
        :param match: The Match that this team belongs to
        """
        return cls(name=name.text, scores=[Score.parse(s) for s in rounds.text.split()], match=match)

    def __str__(self):
        if self.match.bye:
            return f'{self.name} Bye'
        return f'{self.name} {self.final_score}'
class Match:
    """
    Represents a single match of AFL

    :ivar teams: A list of teams, with either two teams or one team (a bye)
    :ivar attendees: Number of attendees at this match, or None if not known
    :ivar date: The time and date that this match started, or None if not known
    :ivar venue: The name of the venue at which this match was played, or None if not known
    :ivar winner: The name of the winning team
    :ivar bye: True if this entry is a bye rather than a played match
    """
    teams: typing.List[TeamMatch]
    attendees: typing.Optional[int]
    date: typing.Optional[datetime.datetime]
    venue: typing.Optional[str]
    winner: str
    bye: bool

    @staticmethod
    def _parse_misc(misc: bs4.Tag) -> dict:
        """
        Parse the date/venue/attendees section

        :param misc: The <td> holding the date text followed by the labelled attendance and venue values
        :return: A dict with a ``date`` key, plus ``venue`` and ``attendees`` keys when those are present
        """
        date = misc.contents[0]
        date_elements = str(date).replace('(', '').replace(')', '').split()
        # Keep the weekday and date, plus the LAST time in the text when more than one is shown.
        # NOTE(review): presumably the parenthesised second time is the Melbourne time - confirm against the site
        date_str = ' '.join(date_elements[0:2] + date_elements[-2:])
        naive_date = datetime.datetime.strptime(date_str, '%a %d-%b-%Y %I:%M %p')
        # pytz zones must be attached with localize(); replace(tzinfo=...) would give the zone's
        # historical local-mean-time offset instead of AEST/AEDT
        parsed_date = AEST.localize(naive_date)

        ret = {
            'date': parsed_date
        }

        # The misc section has variable items, so we have to parse it dynamically:
        # a label element selects which attribute the next non-blank element fills in
        misc_attr = None
        for element in misc.contents[1:]:
            text = str(element)
            if 'Venue' in text:
                misc_attr = 'venue'
            elif 'Att' in text:
                misc_attr = 'attendees'
            elif text.strip():
                if misc_attr == 'venue':
                    ret['venue'] = element.text
                elif misc_attr == 'attendees':
                    # Stored as a plain int (a trailing comma here would wrap it in a 1-tuple)
                    ret['attendees'] = int(text.replace(',', '').replace(' ', ''))
                misc_attr = None

        return ret

    @classmethod
    def parse(cls, table: bs4.Tag):
        """
        Parses a Match from the appropriate <table> element

        A table with 8 cells is read as a played match, and one with 2 cells as a bye.

        :param table: The <table> element for a single match
        :raises MatchException: If the table has any other number of cells
        """
        td = table.find_all('td')

        if len(td) == 8:
            team_1, team_1_stats, team_1_score, misc, team_2, team_2_stats, team_2_score, winner = td
            misc_kwargs = cls._parse_misc(misc)
            # The teams need a reference to the match, so create it empty and attach them afterwards
            match = cls(
                [],
                bye=False,
                winner=winner.b.text,
                **misc_kwargs
            )
            match.teams = [
                TeamMatch.parse_match(team_1, team_1_stats, match),
                TeamMatch.parse_match(team_2, team_2_stats, match)
            ]
            return match
        elif len(td) == 2:
            match = cls([], bye=True, winner=td[0].text)
            match.teams = [TeamMatch.parse_bye(td[0], match)]
            return match
        else:
            raise MatchException('This is invalid markup for a Match object')

    def __init__(self,
                 teams: typing.List[TeamMatch],
                 winner: str,
                 attendees: typing.Optional[int] = None,
                 date: typing.Optional[datetime.datetime] = None,
                 venue: typing.Optional[str] = None,
                 bye: bool = False):
        """
        :param teams: The teams in this match
        :param winner: The name of the winning team
        :param attendees: Number of attendees at this match
        :param date: The time and date that this match started
        :param venue: The name of the venue at which this match was played
        :param bye: True if this entry is a bye rather than a played match
        """
        self.teams = teams
        self.attendees = attendees
        self.date = date
        self.venue = venue
        self.bye = bye
        self.winner = winner

    def __str__(self):
        if self.bye:
            return f'{self.teams[0].name} vs Bye'
        return f'{self.teams[0].name} vs {self.teams[1].name}'
class Round:
    """
    Represents a single round of AFL, with one or more matches being played in that round

    :ivar title: The human-readable title for this round
    :ivar matches: A list of matches played during this round
    """
    title: str
    matches: typing.List['Match']

    def __init__(self, title: str, matches: typing.Optional[list] = None):
        """
        :param title: The human-readable title for this round
        :param matches: The matches played during this round; defaults to a new empty list
        """
        self.title = title
        # A fresh list per instance: a literal [] default would be shared by every Round built without matches
        self.matches = [] if matches is None else matches

    @classmethod
    def parse(cls, title: 'bs4.Tag', table: 'bs4.Tag') -> 'Round':
        """
        Parses a round from two table elements that define it

        :param title: The <table> tag that contains this round's header
        :param table: The <table> tag that contains this round's data
        :raises MatchException: If the title contains 'Final' and the table is not valid match markup
        """
        title_text = title.text

        if 'Final' in title_text:
            # For a final, the data table itself is parsed as the single match
            matches = [Match.parse(table)]
        else:
            matches = []
            for match in table.select('td[width="85%"] table'):
                try:
                    matches.append(Match.parse(match))
                except MatchException:
                    # Nested tables that are not matches are skipped
                    continue

        return cls(title=title_text, matches=matches)

    def __str__(self):
        return self.title
class MatchScraper:
    """
    A static class that can be used to scrape the matches from the AFL Tables website
    """

    @staticmethod
    def _url(year: int) -> str:
        """
        Returns the AFL Tables URL for the provided year
        """
        return urljoin(BASE_URL, f'seas/{year}.html')

    @classmethod
    def scrape(cls, year: int, timeout: float = 30) -> typing.List[Round]:
        """
        Scrapes all the match data for the given year

        :param year: The year to scrape, e.g. 2015
        :param timeout: Seconds to wait for the website before giving up
        :return: The rounds found on that year's season page, in page order
        :raises requests.HTTPError: If the website responds with an error status, e.g. for a year with no page
        :raises requests.Timeout: If the website does not respond within ``timeout`` seconds
        """
        url = cls._url(year)

        # Without a timeout a stalled connection would hang forever, and without the status check
        # an error page would be parsed as if it were a season with no rounds
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html5lib')

        # Filter out irrelevant tables
        tables = [table for table in soup.select('center > table') if
                  table.get('class') != ['sortable'] and table.text != 'Finals']

        # Group the tables into title, content pairs
        rounds = []
        for header, body in grouper(2, tables):
            if body is None:
                # An odd number of tables leaves a trailing header padded with None: there is no data to parse
                continue
            title = header.find('td')
            if title is None:
                # Not a round header, so there is no title to read
                continue
            rounds.append(Round.parse(title, body))

        return rounds