import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import pingouin as pg
from lets_plot import *
from datetime import datetime


# Initialise lets-plot for HTML (notebook) output. `no_js=True` is passed as
# given; presumably this requests static output without the lets-plot
# JavaScript library — TODO confirm against the lets-plot documentation.
LetsPlot.setup_html(no_js=True)

### You don't need to use these settings yourself,
### they are just here to make the charts look nicer!
# Set the plot style for prettier charts.
# The style sheet lives at a machine-specific path, so it is applied only
# when that file is actually present; otherwise matplotlib's default style is
# kept and the rest of the script still runs.
plot_style_file = "C:/Users/msassrb2/The University of Manchester Dropbox/Ralf Becker/Python/Football/plot_style.txt"
if Path(plot_style_file).is_file():
    plt.style.use(plot_style_file)


# Season 24/25 results. The season code in the URL has to match the variable
# name: it is "2425" here (it previously read "2324", which silently loaded a
# second copy of the 2324 file into df2425).
df2425 = pd.read_csv(
    "https://www.football-data.co.uk/mmz4281/2425/E0.csv",
    na_values="",
)


# One results file per season (division code E0), read straight from
# football-data.co.uk. Empty fields are treated as missing values.
csv_options = {"na_values": ""}

df2324 = pd.read_csv("https://www.football-data.co.uk/mmz4281/2324/E0.csv", **csv_options)

df2223 = pd.read_csv("https://www.football-data.co.uk/mmz4281/2223/E0.csv", **csv_options)

df2122 = pd.read_csv("https://www.football-data.co.uk/mmz4281/2122/E0.csv", **csv_options)


# Quick ways to inspect a DataFrame (uncomment one at a time):
# df2324.columns   # column names
# df2324.shape     # (rows, columns)
# df2324.info()    # methods need the parentheses to actually be called
# df2324.head()
# df2324.tail()
# Data type of every column:
df2324.dtypes

Div           object
Date          object
Time          object
HomeTeam      object
AwayTeam      object
FTHG           int64
FTAG           int64
FTR           object
HTHG           int64
HTAG           int64
HTR           object
Referee       object
HS             int64
AS             int64
HST            int64
AST            int64
HF             int64
AF             int64
HC             int64
AC             int64
HY             int64
AY             int64
HR             int64
AR             int64
B365H        float64
B365D        float64
B365A        float64
BWH          float64
BWD          float64
BWA          float64
IWH          float64
IWD          float64
IWA          float64
PSH          float64
PSD          float64
PSA          float64
WHH          float64
WHD          float64
WHA          float64
VCH          float64
VCD          float64
VCA          float64
MaxH         float64
MaxD         float64
MaxA         float64
AvgH         float64
AvgD         float64
AvgA         float64
B365>2.5     float64
B365<2.5     float64
P>2.5        float64
P<2.5        float64
Max>2.5      float64
Max<2.5      float64
Avg>2.5      float64
Avg<2.5      float64
AHh          float64
B365AHH      float64
B365AHA      float64
PAHH         float64
PAHA         float64
MaxAHH       float64
MaxAHA       float64
AvgAHH       float64
AvgAHA       float64
B365CH       float64
B365CD       float64
B365CA       float64
BWCH         float64
BWCD         float64
BWCA         float64
IWCH         float64
IWCD         float64
IWCA         float64
PSCH         float64
PSCD         float64
PSCA         float64
WHCH         float64
WHCD         float64
WHCA         float64
VCCH         float64
VCCD         float64
VCCA         float64
MaxCH        float64
MaxCD        float64
MaxCA        float64
AvgCH        float64
AvgCD        float64
AvgCA        float64
B365C>2.5    float64
B365C<2.5    float64
PC>2.5       float64
PC<2.5       float64
MaxC>2.5     float64
MaxC<2.5     float64
AvgC>2.5     float64
AvgC<2.5     float64
AHCh         float64
B365CAHH     float64
B365CAHA     float64
PCAHH        float64
PCAHA        float64
MaxCAHH      float64
MaxCAHA      float64
AvgCAHH      float64
AvgCAHA      float64
dtype: object


# Show the row at position 1 (the second row), all columns.
df2324.iloc[1,:]

Div                     E0
Date            12/08/2023
Time                 12:30
HomeTeam           Arsenal
AwayTeam     Nott'm Forest
FTHG                     2
FTAG                     1
FTR                      H
HTHG                     2
HTAG                     0
HTR                      H
Referee           M Oliver
HS                      15
AS                       6
HST                      7
AST                      2
HF                      12
AF                      12
HC                       8
AC                       3
HY                       2
AY                       2
HR                       0
AR                       0
B365H                 1.18
B365D                  7.0
B365A                 15.0
BWH                   1.17
BWD                    7.5
BWA                   15.5
IWH                    1.2
IWD                   7.25
IWA                   14.0
PSH                   1.18
PSD                   7.86
PSA                  15.67
WHH                   1.12
WHD                    6.5
WHA                   12.0
VCH                   1.14
VCD                    7.5
VCA                   17.0
MaxH                  1.21
MaxD                   8.5
MaxA                  17.5
AvgH                  1.18
AvgD                  7.64
AvgA                 15.67
B365>2.5              1.44
B365<2.5              2.75
P>2.5                 1.42
P<2.5                 2.93
Max>2.5               1.45
Max<2.5               2.98
Avg>2.5               1.42
Avg<2.5               2.85
AHh                   -2.0
B365AHH               1.88
B365AHA               2.02
PAHH                  1.88
PAHA                  2.01
MaxAHH                1.91
MaxAHA                2.06
AvgAHH                1.87
AvgAHA                1.99
B365CH                1.18
B365CD                 7.0
B365CA                15.0
BWCH                  1.18
BWCD                   7.0
BWCA                  14.5
IWCH                   1.2
IWCD                   7.0
IWCA                  14.0
PSCH                  1.19
PSCD                   8.0
PSCA                  16.0
WHCH                  1.12
WHCD                   6.5
WHCA                  12.0
VCCH                  1.22
VCCD                   7.0
VCCA                  13.0
MaxCH                 1.22
MaxCD                  8.4
MaxCA                 19.0
AvgCH                 1.19
AvgCD                 7.43
AvgCA                15.98
B365C>2.5              1.5
B365C<2.5             2.63
PC>2.5                1.49
PC<2.5                2.65
MaxC>2.5              1.52
MaxC<2.5              2.79
AvgC>2.5              1.49
AvgC<2.5              2.63
AHCh                  -2.0
B365CAHH              1.95
B365CAHA              1.98
PCAHH                 1.93
PCAHA                 1.97
MaxCAHH               2.01
MaxCAHA               2.09
AvgCAHH               1.95
AvgCAHA               1.92
Name: 1, dtype: object


# Allow up to 200 rows in displayed output, then show the same row again.
pd.options.display.max_rows = 200
df2324.iloc[1,:]

Div                     E0
Date            12/08/2023
Time                 12:30
HomeTeam           Arsenal
AwayTeam     Nott'm Forest
FTHG                     2
FTAG                     1
FTR                      H
HTHG                     2
HTAG                     0
HTR                      H
Referee           M Oliver
HS                      15
AS                       6
HST                      7
AST                      2
HF                      12
AF                      12
HC                       8
AC                       3
HY                       2
AY                       2
HR                       0
AR                       0
B365H                 1.18
B365D                  7.0
B365A                 15.0
BWH                   1.17
BWD                    7.5
BWA                   15.5
IWH                    1.2
IWD                   7.25
IWA                   14.0
PSH                   1.18
PSD                   7.86
PSA                  15.67
WHH                   1.12
WHD                    6.5
WHA                   12.0
VCH                   1.14
VCD                    7.5
VCA                   17.0
MaxH                  1.21
MaxD                   8.5
MaxA                  17.5
AvgH                  1.18
AvgD                  7.64
AvgA                 15.67
B365>2.5              1.44
B365<2.5              2.75
P>2.5                 1.42
P<2.5                 2.93
Max>2.5               1.45
Max<2.5               2.98
Avg>2.5               1.42
Avg<2.5               2.85
AHh                   -2.0
B365AHH               1.88
B365AHA               2.02
PAHH                  1.88
PAHA                  2.01
MaxAHH                1.91
MaxAHA                2.06
AvgAHH                1.87
AvgAHA                1.99
B365CH                1.18
B365CD                 7.0
B365CA                15.0
BWCH                  1.18
BWCD                   7.0
BWCA                  14.5
IWCH                   1.2
IWCD                   7.0
IWCA                  14.0
PSCH                  1.19
PSCD                   8.0
PSCA                  16.0
WHCH                  1.12
WHCD                   6.5
WHCA                  12.0
VCCH                  1.22
VCCD                   7.0
VCCA                  13.0
MaxCH                 1.22
MaxCD                  8.4
MaxCA                 19.0
AvgCH                 1.19
AvgCD                 7.43
AvgCA                15.98
B365C>2.5              1.5
B365C<2.5             2.63
PC>2.5                1.49
PC<2.5                2.65
MaxC>2.5              1.52
MaxC<2.5              2.79
AvgC>2.5              1.49
AvgC<2.5              2.63
AHCh                  -2.0
B365CAHH              1.95
B365CAHA              1.98
PCAHH                 1.93
PCAHA                 1.97
MaxCAHH               2.01
MaxCAHA               2.09
AvgCAHH               1.95
AvgCAHA               1.92
Name: 1, dtype: object


# Row labels of the season frame (a 0-based RangeIndex, one label per match).
df2324.index

RangeIndex(start=0, stop=380, step=1)


# Tag every season's frame with identifying columns:
#   * "Season"  - season code, inserted at position 1 (the 2nd column)
#   * "MNo"     - running match number within the season, appended at the end
#   * "MatchNo" - "<Season>_<MNo>", inserted at position 2 (the 3rd column);
#                 e.g. "2122_4" is the 5th match in the 2122 season
# The match counter is derived from the frame's own length rather than a fixed
# 380, so a season file with a different number of rows does not raise a
# length-mismatch error.
for season_df, season_code in ((df2122, "2122"), (df2223, "2223"), (df2324, "2324")):
    season_df.insert(1, "Season", season_code)
    season_df["MNo"] = range(len(season_df))
    season_df.insert(
        2, "MatchNo", season_df["Season"] + "_" + season_df["MNo"].astype("str")
    )

# Stack the three seasons into one frame. Each season keeps its own 0-based
# row labels, so the labels repeat across seasons in the combined frame.
df_combined = pd.concat([df2122, df2223, df2324])


# Keep only the first 26 columns ("Div" through "AR") and drop everything
# after them. The explicit .copy() makes df_matches an independent frame, so
# the column assignments made on it later modify it directly rather than a
# slice of df_combined (this avoids pandas' SettingWithCopyWarning).
df_matches = df_combined.iloc[:, 0:26].copy()
# df_matches = df_combined.loc[:, "Div":"AR"].copy()  # label-based equivalent
df_matches.shape

(1140, 26)


# Store the team, referee and result columns as pandas categoricals.
for column in ("HomeTeam", "AwayTeam", "Referee", "FTR", "HTR"):
    df_matches[column] = df_matches[column].astype("category")
df_matches.dtypes

Div           object
Season        object
MatchNo       object
Date          object
Time          object
HomeTeam    category
AwayTeam    category
FTHG           int64
FTAG           int64
FTR         category
HTHG           int64
HTAG           int64
HTR         category
Referee     category
HS             int64
AS             int64
HST            int64
AST            int64
HF             int64
AF             int64
HC             int64
AC             int64
HY             int64
AY             int64
HR             int64
AR             int64
dtype: object


# Parse the text Date and Time columns into datetimes. The time-only format
# carries no date, so the parsed Time values all fall on 1900-01-01.
parse_formats = {'Date': '%d/%m/%Y', 'Time': '%H:%M'}
for column, fmt in parse_formats.items():
    df_matches[column] = pd.to_datetime(df_combined[column], format=fmt)
df_matches.dtypes

Div                 object
Season              object
MatchNo             object
Date        datetime64[ns]
Time        datetime64[ns]
HomeTeam          category
AwayTeam          category
FTHG                 int64
FTAG                 int64
FTR               category
HTHG                 int64
HTAG                 int64
HTR               category
Referee           category
HS                   int64
AS                   int64
HST                  int64
AST                  int64
HF                   int64
AF                   int64
HC                   int64
AC                   int64
HY                   int64
AY                   int64
HR                   int64
AR                   int64
dtype: object


# Two independent copies of the match table: one to be read from the home
# side's point of view, one from the away side's.
df_home, df_away = df_matches.copy(), df_matches.copy()
# "HomeAway" (inserted as the 6th column) records which point of view it is.
df_home.insert(loc=5, column='HomeAway', value="Home")
df_away.insert(loc=5, column='HomeAway', value="Away")

df_away.columns

Index(['Div', 'Season', 'MatchNo', 'Date', 'Time', 'HomeAway', 'HomeTeam',
       'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee',
       'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR',
       'AR'],
      dtype='object')


# Rename the home/away columns to own-team/opponent columns.
# Each tuple is (home column, away column, own-side name, opponent-side name).
# Columns that are not listed keep their names.
paired_columns = [
    ('HomeTeam', 'AwayTeam', 'Team', 'Opponent'),
    ('FTHG', 'FTAG', 'FTG', 'FTG_Opp'),
    ('HTHG', 'HTAG', 'HTG', 'HTG_Opp'),
    ('HS', 'AS', 'Shots', 'Shots_Opp'),
    ('HST', 'AST', 'ShotsTarget', 'ShotsTarget_Opp'),
    ('HF', 'AF', 'FoulsComm', 'FoulsComm_Opp'),
    ('HC', 'AC', 'Corners', 'Corners_Opp'),
    ('HY', 'AY', 'Yellow', 'Yellow_Opp'),
    ('HR', 'AR', 'Red', 'Red_Opp'),
]
# The two result columns get the same new name in both frames.
result_columns = {'FTR': 'FTResult', 'HTR': 'HTResult'}

home_names = dict(result_columns)
away_names = dict(result_columns)
for home_col, away_col, own_name, opp_name in paired_columns:
    # Home frame: the home column is the team's own statistic ...
    home_names.update({home_col: own_name, away_col: opp_name})
    # ... away frame: the roles are swapped.
    away_names.update({home_col: opp_name, away_col: own_name})

df_home = df_home.rename(columns=home_names)
df_away = df_away.rename(columns=away_names)


# Recode the result categories from "which side won" (H/D/A) to the outcome
# for the team the row belongs to (W/D/L).
home_outcomes = {'H': 'W', 'D': 'D', 'A': 'L'}
away_outcomes = {'H': 'L', 'D': 'D', 'A': 'W'}
for result_col in ('FTResult', 'HTResult'):
    df_home[result_col] = df_home[result_col].cat.rename_categories(home_outcomes)
    df_away[result_col] = df_away[result_col].cat.rename_categories(away_outcomes)


# One row per team per match: stack the home view on top of the away view.
df_teams = pd.concat([df_home, df_away])
# Sort by season, then by match number within the season.
# MatchNo is a string such as "2122_10", so sorting it directly is
# lexicographic ("2122_10" would come before "2122_2"). The key therefore
# compares the integer after the last "_"; for Season, which contains no "_",
# that is simply the season code itself.
df_teams = df_teams.sort_values(
    by=['Season', 'MatchNo'],
    key=lambda col: col.str.rsplit("_", n=1).str[-1].astype(int),
)
# .head() has to be called; without the parentheses the cell only shows the
# bound method's repr.
df_teams.head()

<bound method NDFrame.head of    Div Season  MatchNo       Date                Time HomeAway           Team  \
0   E0   2122   2122_0 2021-08-13 1900-01-01 20:00:00     Home      Brentford   
0   E0   2122   2122_0 2021-08-13 1900-01-01 20:00:00     Away        Arsenal   
1   E0   2122   2122_1 2021-08-14 1900-01-01 12:30:00     Away          Leeds   
1   E0   2122   2122_1 2021-08-14 1900-01-01 12:30:00     Home     Man United   
10  E0   2122  2122_10 2021-08-21 1900-01-01 12:30:00     Away        Burnley   
..  ..    ...      ...        ...                 ...      ...            ...   
97  E0   2324  2324_97 2023-10-29 1900-01-01 14:00:00     Home       Brighton   
98  E0   2324  2324_98 2023-10-29 1900-01-01 14:00:00     Away  Nott'm Forest   
98  E0   2324  2324_98 2023-10-29 1900-01-01 14:00:00     Home      Liverpool   
99  E0   2324  2324_99 2023-10-29 1900-01-01 15:30:00     Home     Man United   
99  E0   2324  2324_99 2023-10-29 1900-01-01 15:30:00     Away       Man City   

         Opponent  FTG  FTG_Opp  ... ShotsTarget  ShotsTarget_Opp  FoulsComm  \
0         Arsenal    2        0  ...           3                4         12   
0       Brentford    0        2  ...           4                3          8   
1      Man United    1        5  ...           3                8          9   
1           Leeds    5        1  ...           8                3         11   
10      Liverpool    0        2  ...           3                9         12   
..            ...  ...      ...  ...         ...              ...        ...   
97         Fulham    1        1  ...           7                5         12   
98      Liverpool    0        3  ...           1                8         13   
98  Nott'm Forest    3        0  ...           8                1          9   
99       Man City    0        3  ...           3               10          9   
99     Man United    3        0  ...          10                3          4   

   FoulsComm_Opp Corners  Corners_Opp  Yellow  Yellow_Opp  Red  Red_Opp  
0              8       2            5       0           0    0        0  
0             12       5            2       0           0    0        0  
1             11       4            5       2           1    0        0  
1              9       5            4       1           2    0        0  
10             6       4            8       0           0    0        0  
..           ...     ...          ...     ...         ...  ...      ...  
97             8       7            3       0           3    0        0  
98             9       3            8       3           2    0        0  
98            13       8            3       2           3    0        0  
99             4       7           12       4           1    0        0  
99             9      12            7       1           4    0        0  

[2280 rows x 27 columns]>


# Summary statistics for the home-goal and away-goal columns.
goal_columns = ['FTHG', 'FTAG']
df_matches[goal_columns].describe()


# Total goals in the match: home goals plus away goals.
df_matches['FTG'] = df_matches['FTHG'].add(df_matches['FTAG'])


# Number of matches, mean and variance of total goals per kick-off time.
# The aggregations are named as strings: passing NumPy callables such as
# np.mean / np.var to .agg() is deprecated in recent pandas, and once the
# callables are used directly np.var would compute the population variance.
# "var" keeps the sample variance (NaN when a time has a single match).
tab1 = df_matches.groupby('Time')['FTG'].agg(['size', 'mean', 'var'])
tab1


# Bin the kick-off times into four parts of the day. Time holds datetimes,
# so the bin edges are parsed with the same '%H:%M' format to be comparable.
day_part_labels = ["noon", "early afternoon", "late afternoon", "evening"]
time_cuts = pd.to_datetime(
    ['11:00', '14:45', '15:50', '19:15', '23:00'],
    format='%H:%M',
)
kickoff_times = df_matches["Time"]
df_matches["TimeofDay"] = pd.cut(
    kickoff_times, bins=time_cuts, labels=day_part_labels, ordered=True
)


# Check the new column on the first rows. .head() has to be called; without
# the parentheses the cell only shows the bound method's repr.
df_matches[['MatchNo', 'Time', 'TimeofDay']].head()

<bound method NDFrame.head of       MatchNo                Time        TimeofDay
0      2122_0 1900-01-01 20:00:00          evening
1      2122_1 1900-01-01 12:30:00             noon
2      2122_2 1900-01-01 15:00:00  early afternoon
3      2122_3 1900-01-01 15:00:00  early afternoon
4      2122_4 1900-01-01 15:00:00  early afternoon
..        ...                 ...              ...
375  2324_375 1900-01-01 16:00:00   late afternoon
376  2324_376 1900-01-01 16:00:00   late afternoon
377  2324_377 1900-01-01 16:00:00   late afternoon
378  2324_378 1900-01-01 16:00:00   late afternoon
379  2324_379 1900-01-01 16:00:00   late afternoon

[1140 rows x 3 columns]>


# Number of matches, mean and variance of total goals per part of the day.
# TimeofDay is categorical; observed=False states the grouping behaviour
# explicitly (every category gets a row) instead of relying on a default
# that pandas is changing. Aggregations are named as strings because NumPy
# callables in .agg() are deprecated; "var" is the sample variance.
tab2 = df_matches.groupby('TimeofDay', observed=False)['FTG'].agg(['size', 'mean', 'var'])
tab2


# First attempt at an efficiency measure: shots divided by goals.
# NOTE: FTHG and FTAG are 0 in some matches (their minimum is 0), and dividing
# by zero gives inf, so the summary of these two columns is not usable
# (mean of inf, NaN standard deviation, and a RuntimeWarning from numpy).
df_matches['HG_eff'] = df_matches['HS']/df_matches['FTHG'] 
df_matches['AG_eff'] = df_matches['AS']/df_matches['FTAG'] 
df_matches[['FTHG', 'HS', 'HG_eff', 'FTAG', 'AS', 'AG_eff']].describe()

c:\Users\msassrb2\Anaconda3\lib\site-packages\numpy\lib\function_base.py:4009: RuntimeWarning: invalid value encountered in subtract
  diff_b_a = subtract(b, a)


# Define HG_eff / AG_eff as goals per shot (goals in the numerator,
# shots in the denominator).
df_matches['HG_eff'] = df_matches['FTHG'].div(df_matches['HS'])
df_matches['AG_eff'] = df_matches['FTAG'].div(df_matches['AS'])
eff_summary_cols = ['FTHG', 'HS', 'HG_eff', 'FTAG', 'AS', 'AG_eff']
df_matches[eff_summary_cols].describe().round(2)  # show two decimal places


# Matches in which the home side had 36 shots or the away side had 31
# (the maxima of HS and AS in the summary table).
home_shots_is_36 = df_matches['HS'] == 36
away_shots_is_31 = df_matches['AS'] == 31
df_matches[home_shots_is_36 | away_shots_is_31]


# Scatter plot, one point per match: HS on the x-axis, HG_eff on the y-axis.
p1 = ggplot(df_matches, aes(x = 'HS', y = 'HG_eff')) + geom_point()
p1


# Mean goals scored per match for Liverpool and Tottenham.
# Team is categorical, so with observed=False (stated explicitly because
# pandas is changing the default) the result keeps one row for every team
# category; the teams removed by the query show NaN.
# 'mean' is passed by name because NumPy callables in .agg() are deprecated.
temp = df_teams.query("Team == 'Liverpool' or Team == 'Tottenham'").copy()
tab3 = temp.groupby('Team', observed=False)['FTG'].agg('mean')
tab3

Team
Arsenal                  NaN
Aston Villa              NaN
Bournemouth              NaN
Brentford                NaN
Brighton                 NaN
Burnley                  NaN
Chelsea                  NaN
Crystal Palace           NaN
Everton                  NaN
Fulham                   NaN
Leeds                    NaN
Leicester                NaN
Liverpool           2.236842
Luton                    NaN
Man City                 NaN
Man United               NaN
Newcastle                NaN
Norwich                  NaN
Nott'm Forest            NaN
Sheffield United         NaN
Southampton              NaN
Tottenham           1.868421
Watford                  NaN
West Ham                 NaN
Wolves                   NaN
Name: FTG, dtype: float64


# Mean goals scored per match for Liverpool and Tottenham, keeping only the
# teams that actually have a value.
# observed=False states the categorical grouping behaviour explicitly;
# 'mean' is passed by name because NumPy callables in .agg() are deprecated.
temp = df_teams.query("Team == 'Liverpool' or Team == 'Tottenham'").copy()
tab4 = temp.groupby('Team', observed=False)['FTG'].agg('mean').dropna()  # .dropna() drops all rows with NA
tab4

Team
Liverpool    2.236842
Tottenham    1.868421
Name: FTG, dtype: float64


# Bar chart of FTG (x-axis) for the rows currently in temp, coloured by Team;
# position = "dodge" draws the teams' bars next to each other.
p2 = ggplot(temp, aes(x = 'FTG', fill = 'Team')) + geom_bar(position = "dodge") 
p2


# Chelsea's matches in date order.
temp = df_teams.query("Team == 'Chelsea'").sort_values('Date').copy()


# Within each season, number the meetings with each opponent:
# 1 for the first meeting, 2 for the second. Opponent is categorical, so the
# grouping behaviour is stated explicitly; cumcount only numbers existing rows.
temp['Order'] = temp.groupby(['Season', 'Opponent'], observed=False).cumcount() + 1


# Mean goals, shots and ShotsTarget by meeting order.
# 'mean' is passed by name because NumPy callables in .agg() are deprecated.
# NOTE: this reuses the name tab4, replacing the table stored there earlier.
tab4 = temp.groupby('Order')[['FTG', 'Shots', 'ShotsTarget']].agg('mean')
tab4


# For each Man City match, flag whether Man City committed more fouls than
# the opponent, then count the True/False flags in a bar chart.
temp = df_teams.query("Team == 'Man City'").copy()
temp['Foul_compare'] = temp['FoulsComm'].gt(temp['FoulsComm_Opp'])
p3 = ggplot(temp, aes(x='Foul_compare')) + geom_bar()
p3

	FTHG	FTAG
count	1140.000000	1140.000000
mean	1.649123	1.334211
std	1.375073	1.244361
min	0.000000	0.000000
25%	1.000000	0.000000
50%	1.000000	1.000000
75%	2.000000	2.000000
max	9.000000	8.000000

	size	mean	var
Time
1900-01-01 12:00:00	3	1.666667	1.333333
1900-01-01 12:30:00	84	2.809524	3.095812
1900-01-01 13:00:00	4	2.500000	1.666667
1900-01-01 13:30:00	4	2.250000	1.583333
1900-01-01 14:00:00	168	2.940476	2.774879
1900-01-01 14:15:00	3	2.000000	1.000000
1900-01-01 15:00:00	387	3.043928	2.886666
1900-01-01 15:30:00	4	3.250000	0.916667
1900-01-01 15:45:00	1	2.000000	NaN
1900-01-01 16:00:00	21	3.666667	2.333333
1900-01-01 16:15:00	1	1.000000	NaN
1900-01-01 16:30:00	89	3.370787	3.213228
1900-01-01 17:30:00	98	2.867347	3.023459
1900-01-01 18:00:00	2	2.000000	8.000000
1900-01-01 19:00:00	2	2.000000	2.000000
1900-01-01 19:30:00	61	2.737705	2.930055
1900-01-01 19:45:00	56	2.428571	2.249351
1900-01-01 20:00:00	127	3.055118	2.846144
1900-01-01 20:15:00	25	3.600000	3.583333

	size	mean	var
TimeofDay
noon	266	2.857143	2.809704
early afternoon	392	3.043367	2.860007
late afternoon	213	3.131455	3.114713
evening	269	2.903346	2.886145

	FTHG	HS	HG_eff	FTAG	AS	AG_eff
count	1140.000000	1140.000000	1140.000000	1140.000000	1140.000000	1140.0
mean	1.649123	14.400877	inf	1.334211	11.742982	inf
std	1.375073	5.888024	NaN	1.244361	5.264696	NaN
min	0.000000	1.000000	1.400000	0.000000	1.000000	1.0
25%	1.000000	10.000000	5.666667	0.000000	8.000000	5.5
50%	1.000000	14.000000	9.500000	1.000000	11.000000	9.5
75%	2.000000	18.000000	20.000000	2.000000	15.000000	NaN
max	9.000000	36.000000	inf	8.000000	31.000000	inf

	FTHG	HS	HG_eff	FTAG	AS	AG_eff
count	1140.00	1140.00	1140.00	1140.00	1140.00	1140.00
mean	1.65	14.40	0.12	1.33	11.74	0.12
std	1.38	5.89	0.10	1.24	5.26	0.12
min	0.00	1.00	0.00	0.00	1.00	0.00
25%	1.00	10.00	0.05	0.00	8.00	0.00
50%	1.00	14.00	0.11	1.00	11.00	0.11
75%	2.00	18.00	0.18	2.00	15.00	0.18
max	9.00	36.00	0.71	8.00	31.00	1.00

Simple data handling and analysis in Python - Football data

Prepare the workspace

Load the data

Explore the data

Data types

Analysis plan

Organise the data

Reshape the data

Analysis

Match-based questions

Team-based questions

Group by 'Opponent' and assign order number (1 for first, 2 for second)

Summary

	Div	Season	MatchNo	Date	Time	HomeTeam	AwayTeam	FTHG	FTAG	FTR	...	HC	AC	HY	AY	HR	AR	FTG	TimeofDay	HG_eff	AG_eff
362	E0	2122	2122_362	2022-05-15	1900-01-01 14:00:00	West Ham	Man City	2	2	D	...	4	9	3	1	0	0	4	noon	0.333333	0.064516
376	E0	2324	2324_376	2024-05-19	1900-01-01 16:00:00	Liverpool	Wolves	2	0	H	...	10	2	1	1	0	1	2	late afternoon	0.055556	0.000000

	FTG	Shots	ShotsTarget
Order
1	1.614035	13.491228	4.789474
2	1.736842	15.000000	5.456140