# Build an IPython Image object for the local file "Fifa.png"; as the last
# expression of the cell it is what the notebook renders as the cell output.
from IPython import display
display.Image("Fifa.png")


import requests
import numpy as np
import pandas as pd
from ggplot import *
import seaborn as sns
from bs4 import BeautifulSoup
import urllib.request
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import sklearn.linear_model as lm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Load the FIFA 21 player table from the CSV file in the working directory.
data_21 = pd.read_csv('players_21.csv')

# Preview the first five rows (DataFrame.head defaults to n=5).
# Per the original author's note, the other yearly datasets share this column
# layout, so one preview is representative of all of them.
data_21.head(n=5)


# Report how many columns the freshly loaded table has.
print(f"Number of Columns: {len(data_21.columns)}")

Number of Columns: 106


# Keep only the first 75 columns (positional slice with iloc) so the table
# holds just the information needed for the analysis.
data_21 = data_21.iloc[:, :75]

# Preview the first five rows of the trimmed table.
data_21.head(n=5)


# Names of the columns to remove from the table. Each name appears once
# ('loaned_from' was previously listed twice).
lst = [
    'value_eur', 'player_url', 'wage_eur', 'body_type', 'skill_moves',
    'weak_foot', 'nation_jersey_number', 'nation_position',
    'international_reputation', 'work_rate', 'real_face',
    'release_clause_eur', 'loaned_from', 'team_position', 'player_tags',
    'team_jersey_number', 'contract_valid_until', 'joined',
    'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed',
    'gk_positioning', 'player_traits', 'defending_marking',
    'defending_standing_tackle', 'defending_sliding_tackle',
]

# Drop every column named in lst. Rebinding the result (instead of
# inplace=True) keeps the step an explicit assignment; a missing column name
# still raises KeyError, as before.
data_21 = data_21.drop(columns=lst)
data_21


# Take the 200 players with the highest 'overall' rating (sorted descending).
df = data_21.sort_values(by='overall', ascending=False).head(200)

# Remove goalkeepers: keep only rows whose player_positions value is not
# exactly 'GK'. A boolean mask filters the whole column in one step instead of
# walking the rows with iterrows() and dropping them one at a time.
df = df[df['player_positions'] != 'GK']

# Preview the first five remaining rows.
df.head()


# Preview the first five rows of df again.
df.head(n=5)


# Name, age and overall rating of every player in df, ordered from the
# highest overall rating to the lowest.
df.sort_values('overall', ascending=False).loc[:, ['short_name', 'age', 'overall']]


# Columns whose pairwise correlations are shown in the heatmap.
heatmap_columns = [
    'overall', 'pace', 'shooting', 'dribbling', 'defending', 'physic',
    'mentality_aggression', 'mentality_positioning', 'mentality_vision',
    'mentality_composure',
]

# Restrict the (overall-sorted) frame to those columns and correlate them.
df2 = df.sort_values(by='overall', ascending=False)[heatmap_columns]
corrMatrix = df2.corr()

# Draw the annotated correlation heatmap on a 9x6 figure.
plt.figure(figsize=(9, 6))
sns.heatmap(corrMatrix, annot=True)
plt.show()


# Scatter plot of shooting (x) against mentality_vision (y) for the players in
# df, with a fitted straight line and name labels for five selected players.
fig, ax = plt.subplots(figsize=(15, 8))

# Columns being compared. The annotation loop below uses its own variable
# names, so x and y stay bound to these Series after the cell runs.
x = df['shooting']
y = df['mentality_vision']
ax.plot(x, y, 'o')

# Degree-1 least-squares fit: m is the slope and b the intercept of the line
# describing how mentality_vision changes with shooting.
m, b = np.polyfit(x, y, 1)
ax.plot(x, m * x + b, color='red')

# Label only the five players of interest. isin() selects their rows in one
# step instead of testing every row against five string comparisons.
highlighted = ['L. Messi', 'Cristiano Ronaldo', 'R. Lewandowski', 'Neymar Jr', 'K. De Bruyne']
labelled = df[df['short_name'].isin(highlighted)]
for name, px, py in zip(labelled['short_name'], labelled['shooting'], labelled['mentality_vision']):
    ax.annotate(name, (px, py))

# Axis labels and title (spelling of "Mentality" and the doubled space fixed).
ax.set_xlabel('Shooting Rank')
ax.set_ylabel('Mentality Vision Rank')
ax.set_title('Shooting Rank vs Mentality Vision Rank')
plt.show()


# Scatter plot of passing (x) against mentality_vision (y) for the players in
# df, with a fitted straight line and name labels for five selected players.
fig, ax = plt.subplots(figsize=(15, 8))

# Columns being compared. The annotation loop below uses its own variable
# names, so x and y stay bound to these Series after the cell runs.
x = df['passing']
y = df['mentality_vision']
ax.plot(x, y, 'o')

# Degree-1 least-squares fit: m is the slope and b the intercept of the line
# describing how mentality_vision changes with passing.
m, b = np.polyfit(x, y, 1)
ax.plot(x, m * x + b, color='red')

# Label only the five players of interest. isin() selects their rows in one
# step instead of testing every row against five string comparisons.
highlighted = ['L. Messi', 'Cristiano Ronaldo', 'R. Lewandowski', 'Neymar Jr', 'K. De Bruyne']
labelled = df[df['short_name'].isin(highlighted)]
for name, px, py in zip(labelled['short_name'], labelled['passing'], labelled['mentality_vision']):
    ax.annotate(name, (px, py))

# Axis labels and title, worded consistently with each other
# (doubled space removed, title now uses "Rank" like both axes).
ax.set_xlabel('Passing Rank')
ax.set_ylabel('Mentality Vision Rank')
ax.set_title('Passing Rank vs Mentality Vision Rank')
plt.show()


# Scatter plot of passing (x) against shooting (y) for the players in df,
# with a fitted straight line and name labels for five selected players.
fig, ax = plt.subplots(figsize=(15, 8))

# Columns being compared. The annotation loop below uses its own variable
# names, so x and y stay bound to these Series after the cell runs.
x = df['passing']
y = df['shooting']
ax.plot(x, y, 'o')

# Degree-1 least-squares fit: m is the slope and b the intercept of the line
# describing how shooting changes with passing.
m, b = np.polyfit(x, y, 1)
ax.plot(x, m * x + b, color='red')

# Label only the five players of interest. isin() selects their rows in one
# step instead of testing every row against five string comparisons.
highlighted = ['L. Messi', 'Cristiano Ronaldo', 'R. Lewandowski', 'Neymar Jr', 'K. De Bruyne']
labelled = df[df['short_name'].isin(highlighted)]
for name, px, py in zip(labelled['short_name'], labelled['passing'], labelled['shooting']):
    ax.annotate(name, (px, py))

# Axis labels and title.
ax.set_xlabel('Passing Rank')
ax.set_ylabel('Shooting Rank')
ax.set_title('Passing Rank vs Shooting Rank')
plt.show()


# Scatter plot of dribbling (x) against mentality_vision (y) for the players
# in df, with a fitted straight line and name labels for five selected players.
fig, ax = plt.subplots(figsize=(15, 8))

# Columns being compared. The annotation loop below uses its own variable
# names, so x and y stay bound to these Series after the cell runs.
x = df['dribbling']
y = df['mentality_vision']
ax.plot(x, y, 'o')

# Degree-1 least-squares fit: m is the slope and b the intercept of the line
# describing how mentality_vision changes with dribbling.
m, b = np.polyfit(x, y, 1)
ax.plot(x, m * x + b, color='red')

# Label only the five players of interest. isin() selects their rows in one
# step instead of testing every row against five string comparisons.
highlighted = ['L. Messi', 'Cristiano Ronaldo', 'R. Lewandowski', 'Neymar Jr', 'K. De Bruyne']
labelled = df[df['short_name'].isin(highlighted)]
for name, px, py in zip(labelled['short_name'], labelled['dribbling'], labelled['mentality_vision']):
    ax.annotate(name, (px, py))

# Axis labels and title; the x-axis now says "Rank" to match the title and
# the y-axis.
ax.set_xlabel('Dribbling Rank')
ax.set_ylabel('Mentality Vision Rank')
ax.set_title('Dribbling Rank vs Mentality Vision Rank')
plt.show()


# Attribute columns plotted for each player in the comparison charts.
skills = [
    'pace', 'shooting', 'passing', 'dribbling', 'defending',
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
    'attacking_short_passing', 'attacking_volleys',
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
    'skill_long_passing', 'skill_ball_control',
    'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
    'movement_reactions', 'movement_balance',
    'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength',
    'power_long_shots',
]

# One frame per player, restricted to the skill columns: select the rows whose
# short_name matches, then rebuild the frame with only the columns in skills.
messi = pd.DataFrame(df.loc[df['short_name'] == 'L. Messi'], columns=skills)
ronaldo = pd.DataFrame(df.loc[df['short_name'] == 'Cristiano Ronaldo'], columns=skills)

# Overlay both players on one 15x8 figure: Messi in blue, Ronaldo in green.
plt.figure(figsize=(15, 8))
sns.pointplot(data=messi, color='blue', alpha=0.6)
sns.pointplot(data=ronaldo, color='green', alpha=0.6)

# Title, axis labels, vertical tick labels and a background grid.
plt.title('Messi vs Ronaldo', fontsize=20)
plt.xlabel('Skillset', fontsize=15)
plt.ylabel('Skill Rank', fontsize=15)
plt.xticks(rotation=90)
plt.grid()


# Frame for Neymar restricted to the skill columns: select the rows whose
# short_name matches, then rebuild the frame with only the columns in skills.
neymar = pd.DataFrame(df.loc[df['short_name'] == 'Neymar Jr'], columns=skills)

# Overlay Messi (blue) and Neymar (yellow) on one 15x8 figure.
plt.figure(figsize=(15, 8))
sns.pointplot(data=messi, color='blue', alpha=0.5)
sns.pointplot(data=neymar, color='yellow', alpha=0.5)

# Title, axis labels, vertical tick labels and a background grid.
plt.title('Messi vs Neymar', fontsize=20)
plt.xlabel('Skillset', fontsize=15)
plt.ylabel('Skill Ranking', fontsize=15)
plt.xticks(rotation=90)
plt.grid()


# Frame for Lewandowski restricted to the skill columns: select the rows whose
# short_name matches, then rebuild the frame with only the columns in skills.
lewand = pd.DataFrame(df.loc[df['short_name'] == 'R. Lewandowski'], columns=skills)

# Overlay Messi (blue) and Lewandowski (red) on one 15x8 figure.
plt.figure(figsize=(15, 8))
sns.pointplot(data=messi, color='blue', alpha=0.6)
sns.pointplot(data=lewand, color='red', alpha=0.6)

# Title, axis labels, vertical tick labels and a background grid.
plt.title('Messi vs Lewandowski', fontsize=20)
plt.xlabel('Skillset', fontsize=15)
plt.ylabel('Skill Ranking', fontsize=15)
plt.xticks(rotation=90)
plt.grid()


# Independent variables for the regression, selected by column name rather
# than by position so the selection does not silently shift if the column
# layout of df changes. These are the same 19 columns the positional indices
# [3, 15..30, 43, 44] picked out.
feature_columns = [
    'age', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
    'attacking_short_passing', 'attacking_volleys',
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
    'skill_long_passing', 'skill_ball_control',
    'mentality_positioning', 'mentality_vision',
]
x = df[feature_columns]
x.head()


# Dependent variable: each player's 'overall' rating, selected by name instead
# of by column position 11 so it does not depend on the column order of df.
y = df['overall']
y.head()

0    93
1    92
3    91
4    91
5    91
Name: overall, dtype: int64


# train_test_split (sklearn.model_selection) splits the features and target
# into a training subset and a test subset. Here 20% of the rows are held out
# for testing; random_state pins the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit a linear regression on the training subset and keep its predictions for
# the held-out rows so they can be inspected or scored afterwards.
linreg = LinearRegression()
linreg.fit(x_train, y_train)
y_pred = linreg.predict(x_test)

# Ordinary least squares on the full data set via statsmodels, for the
# coefficient table and fit statistics.
# NOTE(review): sm.OLS does not add an intercept on its own, so this model is
# fitted through the origin and the summary reports an *uncentered* R-squared.
# If an intercept is wanted, pass sm.add_constant(x) as exog - confirm intent.
OLS = sm.OLS(endog=y, exog=x).fit()
OLS.summary()

	sofifa_id	player_url	short_name	long_name	age	dob	height_cm	weight_kg	nationality	club_name	...	lwb	ldm	cdm	rdm	rwb	lb	lcb	cb	rcb	rb
0	158023	https://sofifa.com/player/158023/lionel-messi/...	L. Messi	Lionel Andrés Messi Cuccittini	33	1987-06-24	170	72	Argentina	FC Barcelona	...	66+3	65+3	65+3	65+3	66+3	62+3	52+3	52+3	52+3	62+3
1	20801	https://sofifa.com/player/20801/c-ronaldo-dos-...	Cristiano Ronaldo	Cristiano Ronaldo dos Santos Aveiro	35	1985-02-05	187	83	Portugal	Juventus	...	65+3	61+3	61+3	61+3	65+3	61+3	54+3	54+3	54+3	61+3
2	200389	https://sofifa.com/player/200389/jan-oblak/210002	J. Oblak	Jan Oblak	27	1993-01-07	188	87	Slovenia	Atlético Madrid	...	32+3	36+3	36+3	36+3	32+3	32+3	33+3	33+3	33+3	32+3
3	188545	https://sofifa.com/player/188545/robert-lewand...	R. Lewandowski	Robert Lewandowski	31	1988-08-21	184	80	Poland	FC Bayern München	...	64+3	65+3	65+3	65+3	64+3	61+3	60+3	60+3	60+3	61+3
4	190871	https://sofifa.com/player/190871/neymar-da-sil...	Neymar Jr	Neymar da Silva Santos Júnior	28	1992-02-05	175	68	Brazil	Paris Saint-Germain	...	67+3	62+3	62+3	62+3	67+3	62+3	49+3	49+3	49+3	62+3

	sofifa_id	player_url	short_name	long_name	age	dob	height_cm	weight_kg	nationality	club_name	...	power_long_shots	mentality_aggression	mentality_interceptions	mentality_positioning	mentality_vision	mentality_penalties	mentality_composure	defending_marking	defending_standing_tackle	defending_sliding_tackle
0	158023	https://sofifa.com/player/158023/lionel-messi/...	L. Messi	Lionel Andrés Messi Cuccittini	33	1987-06-24	170	72	Argentina	FC Barcelona	...	94	44	40	93	95	75	96	NaN	35	24
1	20801	https://sofifa.com/player/20801/c-ronaldo-dos-...	Cristiano Ronaldo	Cristiano Ronaldo dos Santos Aveiro	35	1985-02-05	187	83	Portugal	Juventus	...	93	63	29	95	82	84	95	NaN	32	24
2	200389	https://sofifa.com/player/200389/jan-oblak/210002	J. Oblak	Jan Oblak	27	1993-01-07	188	87	Slovenia	Atlético Madrid	...	12	34	19	11	65	11	68	NaN	12	18
3	188545	https://sofifa.com/player/188545/robert-lewand...	R. Lewandowski	Robert Lewandowski	31	1988-08-21	184	80	Poland	FC Bayern München	...	85	81	49	94	79	88	88	NaN	42	19
4	190871	https://sofifa.com/player/190871/neymar-da-sil...	Neymar Jr	Neymar da Silva Santos Júnior	28	1992-02-05	175	68	Brazil	Paris Saint-Germain	...	84	51	36	87	90	92	93	NaN	30	29

	sofifa_id	short_name	long_name	age	dob	height_cm	weight_kg	nationality	club_name	league_name	...	power_jumping	power_stamina	power_strength	power_long_shots	mentality_aggression	mentality_interceptions	mentality_positioning	mentality_vision	mentality_penalties	mentality_composure
0	158023	L. Messi	Lionel Andrés Messi Cuccittini	33	1987-06-24	170	72	Argentina	FC Barcelona	Spain Primera Division	...	68	72	69	94	44	40	93	95	75	96
1	20801	Cristiano Ronaldo	Cristiano Ronaldo dos Santos Aveiro	35	1985-02-05	187	83	Portugal	Juventus	Italian Serie A	...	95	84	78	93	63	29	95	82	84	95
2	200389	J. Oblak	Jan Oblak	27	1993-01-07	188	87	Slovenia	Atlético Madrid	Spain Primera Division	...	78	41	78	12	34	19	11	65	11	68
3	188545	R. Lewandowski	Robert Lewandowski	31	1988-08-21	184	80	Poland	FC Bayern München	German 1. Bundesliga	...	84	76	86	85	81	49	94	79	88	88
4	190871	Neymar Jr	Neymar da Silva Santos Júnior	28	1992-02-05	175	68	Brazil	Paris Saint-Germain	French Ligue 1	...	62	81	50	84	51	36	87	90	92	93
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
18939	256679	K. Angulo	Kevin Angulo	24	1996-04-13	176	73	Colombia	América de Cali	Colombian Liga Postobón	...	57	40	51	36	57	41	31	49	39	36
18940	257710	Zhang Mengxuan	Mengxuan Zhang	21	1999-04-26	177	70	China PR	Chongqing Dangdai Lifan FC SWM Team	Chinese Super League	...	65	55	45	23	44	50	25	25	35	40
18941	250989	Wang Zhenghao	王政豪	20	2000-06-28	185	74	China PR	Tianjin TEDA FC	Chinese Super League	...	69	58	55	22	46	45	25	25	35	40
18942	257697	Chen Zitong	Zitong Chen	23	1997-02-20	186	80	China PR	Shijiazhuang Ever Bright F.C.	Chinese Super League	...	56	48	63	33	56	40	31	45	42	43
18943	257936	Song Yue	Yue Song	28	1991-11-20	185	79	China PR	Tianjin TEDA FC	Chinese Super League	...	59	54	62	34	57	35	48	44	36	35

	sofifa_id	short_name	long_name	age	dob	height_cm	weight_kg	nationality	club_name	league_name	...	power_jumping	power_stamina	power_strength	power_long_shots	mentality_aggression	mentality_interceptions	mentality_positioning	mentality_vision	mentality_penalties	mentality_composure
0	158023	L. Messi	Lionel Andrés Messi Cuccittini	33	1987-06-24	170	72	Argentina	FC Barcelona	Spain Primera Division	...	68	72	69	94	44	40	93	95	75	96
1	20801	Cristiano Ronaldo	Cristiano Ronaldo dos Santos Aveiro	35	1985-02-05	187	83	Portugal	Juventus	Italian Serie A	...	95	84	78	93	63	29	95	82	84	95
3	188545	R. Lewandowski	Robert Lewandowski	31	1988-08-21	184	80	Poland	FC Bayern München	German 1. Bundesliga	...	84	76	86	85	81	49	94	79	88	88
4	190871	Neymar Jr	Neymar da Silva Santos Júnior	28	1992-02-05	175	68	Brazil	Paris Saint-Germain	French Ligue 1	...	62	81	50	84	51	36	87	90	92	93
5	192985	K. De Bruyne	Kevin De Bruyne	29	1991-06-28	181	70	Belgium	Manchester City	English Premier League	...	63	89	74	91	76	66	88	94	84	91

	sofifa_id	short_name	long_name	age	dob	height_cm	weight_kg	nationality	club_name	league_name	...	power_jumping	power_stamina	power_strength	power_long_shots	mentality_aggression	mentality_interceptions	mentality_positioning	mentality_vision	mentality_penalties	mentality_composure
0	158023	L. Messi	Lionel Andrés Messi Cuccittini	33	1987-06-24	170	72	Argentina	FC Barcelona	Spain Primera Division	...	68	72	69	94	44	40	93	95	75	96
1	20801	Cristiano Ronaldo	Cristiano Ronaldo dos Santos Aveiro	35	1985-02-05	187	83	Portugal	Juventus	Italian Serie A	...	95	84	78	93	63	29	95	82	84	95
3	188545	R. Lewandowski	Robert Lewandowski	31	1988-08-21	184	80	Poland	FC Bayern München	German 1. Bundesliga	...	84	76	86	85	81	49	94	79	88	88
4	190871	Neymar Jr	Neymar da Silva Santos Júnior	28	1992-02-05	175	68	Brazil	Paris Saint-Germain	French Ligue 1	...	62	81	50	84	51	36	87	90	92	93
5	192985	K. De Bruyne	Kevin De Bruyne	29	1991-06-28	181	70	Belgium	Manchester City	English Premier League	...	63	89	74	91	76	66	88	94	84	91

FIFA 2021 Player Analysis

By: Nathan Chung

Outline

Introduction

Background Information

Set Up

Data Collection/Data Management

What is the data?

Getting Data

Data Cleaning

More Data Cleaning

Exploratory Data Analysis

Understanding the Data

Creating Useful Visualizations

Another form of visualization

Machine Learning/Linear Regression

Linear Regression

How to Interpret the results from running linear regression?

Reading the results from linear regression

Conclusion

	age	pace	shooting	passing	dribbling	defending	physic	attacking_crossing	attacking_finishing	attacking_heading_accuracy	attacking_short_passing	attacking_volleys	skill_dribbling	skill_curve	skill_fk_accuracy	skill_long_passing	skill_ball_control	mentality_positioning	mentality_vision
0	33	85.0	92.0	91.0	95.0	38.0	65.0	85	95	70	91	88	96	93	94	91	96	93	95
1	35	89.0	93.0	81.0	89.0	35.0	77.0	84	95	90	82	86	88	81	76	77	92	95	82
3	31	78.0	91.0	78.0	85.0	43.0	82.0	71	94	85	84	89	85	79	85	70	88	94	79
4	28	91.0	85.0	86.0	94.0	36.0	59.0	85	87	62	87	87	95	88	89	81	95	87	90
5	29	76.0	86.0	93.0	88.0	64.0	78.0	94	82	55	94	82	88	85	83	93	92	88	94

Dep. Variable:	overall	R-squared (uncentered):	0.999
Model:	OLS	Adj. R-squared (uncentered):	0.999
Method:	Least Squares	F-statistic:	1.379e+04
Date:	Mon, 17 May 2021	Prob (F-statistic):	1.05e-239
Time:	12:20:35	Log-Likelihood:	-373.25
No. Observations:	174	AIC:	784.5
Df Residuals:	155	BIC:	844.5
Df Model:	19
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
age	0.2661	0.051	5.229	0.000	0.166	0.367
pace	0.1546	0.025	6.188	0.000	0.105	0.204
shooting	0.1040	0.077	1.343	0.181	-0.049	0.257
passing	0.5677	0.652	0.871	0.385	-0.721	1.856
dribbling	0.4292	0.196	2.189	0.030	0.042	0.816
defending	-0.0155	0.020	-0.766	0.445	-0.055	0.024
physic	0.1299	0.034	3.819	0.000	0.063	0.197
attacking_crossing	-0.1127	0.134	-0.844	0.400	-0.376	0.151
attacking_finishing	0.0105	0.055	0.190	0.850	-0.099	0.120
attacking_heading_accuracy	0.0700	0.020	3.461	0.001	0.030	0.110
attacking_short_passing	0.1823	0.244	0.747	0.456	-0.300	0.664
attacking_volleys	-0.0285	0.028	-1.017	0.311	-0.084	0.027
skill_dribbling	-0.3443	0.123	-2.809	0.006	-0.586	-0.102
skill_curve	-0.0571	0.049	-1.159	0.248	-0.154	0.040
skill_fk_accuracy	-0.0552	0.037	-1.495	0.137	-0.128	0.018
skill_long_passing	-0.1023	0.105	-0.976	0.331	-0.309	0.105
skill_ball_control	0.3058	0.107	2.869	0.005	0.095	0.516
mentality_positioning	-0.1577	0.037	-4.237	0.000	-0.231	-0.084
mentality_vision	-0.1259	0.136	-0.923	0.357	-0.395	0.143

Omnibus:	8.103	Durbin-Watson:	1.517
Prob(Omnibus):	0.017	Jarque-Bera (JB):	13.540
Skew:	0.175	Prob(JB):	0.00115
Kurtosis:	4.321	Cond. No.	1.39e+03