2017年NBA数据分析
目录:前言 · 获取数据 · 数据分析 · 数据相关性 · 基本数据排名分析 · Seaborn常用的三个数据可视化方法 · 单变量 · 双变量 · 多变量 · 衍生变量的一些可视化实践-以年龄为例 · 球队数据分析 · 球队薪资排行 · 按照球队综合实力排名 · 利用箱线图和小提琴图进行数据分析
前言
原始数据可以通过我分享的资源获取 NBA–2017年数据表
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
获取数据
# Load the 2017 NBA player statistics (with salaries) from the CSV file.
data = pd.read_csv(filepath_or_buffer="./data/nba_2017_nba_players_with_salary.csv")
# Preview the first five rows, then check the (rows, columns) shape.
data.head(n=5)
data.shape
(342, 38)
# Take a rough look at the summary statistics of the data.
data.describe()
RkAGEMPFGFGAFG%3P3PA3P%2P...GPMPGORPMDRPMRPMWINS_RPMPIEPACEWSALARY_MILLIONScount342.000000342.000000342.000000342.000000342.000000342.000000342.000000342.000000320.000000342.000000...342.000000342.000000342.000000342.000000342.000000342.000000342.000000342.000000342.000000342.000000mean217.26900626.44444421.5725153.4836267.7254390.4460960.8657892.4400580.3070162.620175...58.19883021.572807-0.676023-0.005789-0.6818132.8617259.18684298.34105328.9502927.294006std136.4031384.2956868.8040182.2008724.6469330.0789920.7800102.0217160.1346911.828714...22.2820158.8041212.0632371.6142932.5220143.8809143.5854752.87009114.6038766.516326min1.00000019.0000002.2000000.0000000.8000000.0000000.0000000.0000000.0000000.000000...2.0000002.200000-4.430000-3.920000-6.600000-2.320000-1.60000087.4600000.0000000.03000025%100.25000023.00000015.0250001.8000004.2250000.4022500.2000000.8000000.2802501.200000...43.50000015.025000-2.147500-1.222500-2.4225000.1025007.10000096.85000019.0000002.18500050%205.50000026.00000021.6500003.0000006.7000000.4420000.7000002.2000000.3405002.200000...66.00000021.650000-0.990000-0.130000-1.1700001.4100008.70000098.20500029.0000004.92000075%327.75000029.00000029.0750004.70000010.4000000.4810001.4000003.6000000.3735003.700000...76.00000029.0750000.2575001.0675000.8650004.48750010.900000100.06000039.00000011.110000max482.00000040.00000037.80000010.30000024.0000000.7500004.10000010.0000001.0000009.700000...82.00000037.8000007.2700006.0200008.42000020.43000023.000000109.87000066.00000030.960000
8 rows × 35 columns
数据分析
数据相关性
# Select the columns used for the correlation analysis.
data_cor = data.loc[
    :,
    [
        'RPM', 'AGE', 'SALARY_MILLIONS', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
        'BLK', 'TOV', 'PF', 'POINTS', 'GP', 'MPG', 'ORPM', 'DRPM',
    ],
]
# Preview the first rows of the selection.
data_cor.head()
RPMAGESALARY_MILLIONSORBDRBTRBASTSTLBLKTOVPFPOINTSGPMPGORPMDRPM06.272826.501.79.010.710.41.60.45.42.331.68134.66.74-0.4714.812726.501.27.08.111.21.50.55.72.729.18136.46.38-1.5721.83276.590.62.12.75.90.90.22.82.228.97633.85.72-3.8934.352322.122.39.511.82.11.32.22.42.228.07536.10.453.9044.202616.962.18.911.04.61.41.33.73.927.07234.23.560.64
# Pairwise correlation between the selected columns.
corr = data_cor.corr(method="pearson")
corr.head(n=5)
RPMAGESALARY_MILLIONSORBDRBTRBASTSTLBLKTOVPFPOINTSGPMPGORPMDRPMRPM1.0000000.1758200.4775420.3887640.6235150.5878530.4819710.5990080.4630970.4920140.4342260.6044320.3408100.5494490.7698220.578388AGE0.1758201.0000000.353312-0.0157520.0888590.0620640.1149080.069892-0.0629170.0306730.0055120.0314220.0518630.0996570.1361770.100636SALARY_MILLIONS0.4775420.3533121.0000000.2649540.5315690.4820880.4861590.4467630.2602880.5369930.3415120.6354250.3480930.5941620.5036820.102307ORB0.388764-0.0157520.2649541.0000000.7313450.861103-0.0116320.1690750.6542650.2746700.5579570.2849080.2969750.3421400.1021130.476857DRB0.6235150.0888590.5315690.7313451.0000000.9762440.3507860.4857260.6607330.5980430.6707080.6482670.4733760.6846620.4284330.426536
# Create the figure, then draw the correlation heatmap on it.
plt.figure(dpi=100, figsize=(20, 8))
sns.heatmap(
    data=corr,
    annot=True,      # write the value inside each cell
    linewidths=0.1,  # width of the lines separating the cells
    square=True,     # keep every cell square
)
基本数据排名分析
# Rank players by efficiency (RPM), highest first.
data.loc[:, ["PLAYER", "RPM", "AGE"]].sort_values(by="RPM", ascending=False).head()
PLAYERRPMAGE6LeBron James8.423237Chris Paul7.92318Stephen Curry7.4128120Draymond Green7.14267Kawhi Leonard7.0825
# Rank players by salary, highest first.
data.loc[:, ["PLAYER", "RPM", "AGE", "SALARY_MILLIONS"]].sort_values(
    by="SALARY_MILLIONS", ascending=False
).head()
PLAYERRPMAGESALARY_MILLIONS6LeBron James8.423230.9625Mike Conley4.472926.5467Al Horford1.823026.540Russell Westbrook6.272826.501James Harden4.812726.50
Seaborn常用的三个数据可视化方法
单变量:
# Use seaborn's distplot to look at the distributions of player salary,
# efficiency (RPM) and age, one per row of a 3x1 subplot grid.
# NOTE(review): distplot is marked deprecated in recent seaborn releases --
# confirm against the installed version before reusing this cell.
sns.set_style("darkgrid")
plt.figure(figsize=(10, 10))
for position, (column, label) in enumerate(
    [("SALARY_MILLIONS", "salary"), ("RPM", "RPM"), ("AGE", "AGE")], start=1
):
    plt.subplot(3, 1, position)
    sns.distplot(data[column])
    plt.ylabel(label)
双变量
# Joint distribution of age and salary, drawn as a hexbin plot.
# The vectors are passed by keyword: recent seaborn releases no longer accept
# x and y positionally, while the keyword form also works on older releases.
sns.jointplot(x=data.AGE, y=data.SALARY_MILLIONS, kind="hex")
多变量
# Keep only the four columns that will be compared pairwise.
pair_columns = ['RPM', 'SALARY_MILLIONS', 'AGE', 'POINTS']
multi_data = data.loc[:, pair_columns]
multi_data.head(n=5)
RPMSALARY_MILLIONSAGEPOINTS06.2726.502831.614.8126.502729.121.836.592728.934.3522.122328.044.2016.962627.0
# Plot every pair of the selected variables against each other.
sns.pairplot(data=multi_data)
衍生变量的一些可视化实践-以年龄为例
def age_cut(df):
    """Classify a row into an age bracket based on its ``AGE`` attribute.

    Args:
        df: Any object exposing an ``AGE`` attribute that can be compared
            with integers (for example one row of the players DataFrame).

    Returns:
        ``"young"`` when ``AGE <= 24``, ``"old"`` when ``AGE >= 30``,
        and ``"best"`` for everything in between.
    """
    if df.AGE <= 24:
        return "young"
    elif df.AGE >= 30:
        return "old"
    else:
        return "best"
# Derive an age bracket for every row with DataFrame.apply.
# Functions are objects: they can be passed as arguments and returned from
# other functions, so age_cut itself is handed to apply as the callback.
# Each row is passed to age_cut and the returned label is stored in the new
# "age_cut" column.
data["age_cut"] = data.apply(age_cut, axis=1)
data.head(n=5)
RkPLAYERPOSITIONAGEMPFGFGAFG%3P3PA...MPGORPMDRPMRPMWINS_RPMPIEPACEWSALARY_MILLIONSage_cut01Russell WestbrookPG2834.610.224.00.4252.57.2...34.66.74-0.476.2717.3423.0102.314626.50best12James HardenPG2736.48.318.90.4403.29.3...36.46.38-1.574.8115.5419.0102.985426.50best23Isaiah ThomasPG2733.89.019.40.4633.28.5...33.85.72-3.891.838.1916.199.84516.59best34Anthony DavisC2336.110.320.30.5050.51.8...36.10.453.904.3512.8119.2100.193122.12young46DeMarcus CousinsC2634.29.019.90.4521.85.0...34.23.560.644.2011.2617.897.113016.96best
5 rows × 39 columns
# Constant helper column, convenient for counting.
data["cut"] = 1
# Salaries of the players whose age bracket is "best" (first five rows).
is_best = data["age_cut"] == "best"
data.loc[is_best, "SALARY_MILLIONS"].head()
0 26.50
1 26.50
2 6.59
4 16.96
5 24.33
Name: SALARY_MILLIONS, dtype: float64
# Compare salary with efficiency (RPM), one marker series per age bracket.
sns.set_style("darkgrid")
plt.figure(figsize=(10, 10), dpi=100)
plt.title("RPM and Salary")

# "old" bracket
x1 = data.loc[data.age_cut == "old"].SALARY_MILLIONS
y1 = data.loc[data.age_cut == "old"].RPM
plt.plot(x1, y1, "^")

# "best" bracket
x2 = data.loc[data.age_cut == "best"].SALARY_MILLIONS
y2 = data.loc[data.age_cut == "best"].RPM
plt.plot(x2, y2, "^")

# "young" bracket
x3 = data.loc[data.age_cut == "young"].SALARY_MILLIONS
y3 = data.loc[data.age_cut == "young"].RPM
plt.plot(x3, y3, ".")
# Pick several statistics plus the age bracket for a pair plot.
pair_columns2 = ['RPM', 'POINTS', 'TRB', 'AST', 'STL', 'BLK', 'age_cut']
multi_data2 = data.loc[:, pair_columns2]
# hue assigns one colour to each distinct value of the given column.
sns.pairplot(data=multi_data2, hue="age_cut")
球队数据分析
球队薪资排行
# Aggregate with .agg() and a {column: function} dict:
# the highest salary inside each age bracket.
grouped_by_bracket = data.groupby("age_cut")
grouped_by_bracket.agg({"SALARY_MILLIONS": np.max})
SALARY_MILLIONSage_cutbest26.54old30.96young22.12
# Group by team and compute the mean salary of each team.
data_team = data.groupby("TEAM").agg({"SALARY_MILLIONS": np.mean})
# Order the teams by mean salary, highest first, and show the top ten.
team_salary_ranking = data_team.sort_values(by="SALARY_MILLIONS", ascending=False)
team_salary_ranking.head(10)
SALARY_MILLIONSTEAMCLE17.095000HOU13.432000GS12.701429ORL/TOR11.125000POR9.730000WSH9.628889ORL9.490000MIL/CHA9.425000SA9.347273NO/SAC8.970000
# Group by team and age bracket; the intent is to order the groups by number
# of listed players (descending) and, for equal counts, by RPM (descending).
data_rpm = data.groupby(by=["TEAM","age_cut"]).agg({"SALARY_MILLIONS": np.mean,"RPM": np.mean, "PLAYER": np.size})
# NOTE(review): the sorted frame below is not assigned to anything, so the
# final head() call still shows data_rpm in its original group order.
# Confirm whether the sorted view was meant to be kept.
data_rpm.sort_values(by=["PLAYER", "RPM"], ascending=False).head()
data_rpm.head()
SALARY_MILLIONSRPMPLAYERTEAMage_cutATLbest4.678000-1.7680005old12.7750000.9825004young1.926667-3.0766673ATL/CLEold5.040000-2.4850002ATL/PHI/OKCbest8.4000001.7200001
按照球队综合实力排名
# Per-team summary: mean salary, RPM, points, eFG%, minutes per game and age,
# plus the size of the PLAYER column in each group. as_index=False keeps TEAM
# as a regular column.
team_aggregations = {
    'SALARY_MILLIONS': np.mean,
    'RPM': np.mean,
    'PLAYER': np.size,
    'POINTS': np.mean,
    'eFG%': np.mean,
    'MPG': np.mean,
    'AGE': np.mean,
}
data_rpm2 = data.groupby(by=['TEAM'], as_index=False).agg(team_aggregations)
data_rpm2.head()
TEAMSALARY_MILLIONSRPMPLAYERPOINTSeFG%MPGAGE0ATL6.689167-1.178333127.4166670.44266718.54166727.0000001ATL/CLE5.040000-2.48500027.6500000.58200021.05000035.5000002ATL/PHI/OKC8.4000001.720000113.1000000.51100026.10000029.0000003BKN5.704545-1.224545119.0454550.48727320.22727327.6363644BKN/WSH4.910000-4.04500028.1500000.47000017.35000027.000000
# Order the teams by mean RPM, highest first, and show the top five.
teams_by_rpm = data_rpm2.sort_values(by="RPM", ascending=False)
teams_by_rpm.head()
TEAMSALARY_MILLIONSRPMPLAYERPOINTSeFG%MPGAGE18GS12.7014293.478571714.5285710.57514326.70000028.7142869CLE17.0950002.566667615.8833330.55583329.76666728.0000002ATL/PHI/OKC8.4000001.720000113.1000000.51100026.10000029.00000020HOU13.4320001.582000515.4200000.53460029.98000027.20000044SA9.3472730.901818119.8181820.52418221.47272729.545455
利用箱线图和小提琴图进行数据分析
# Build a boolean mask marking the rows that belong to the selected teams.
teams_of_interest = ['GS', 'CLE', 'SA', 'LAC', 'OKC', 'UTAH', 'CHA', 'TOR', 'NO', 'BOS']
data["TEAM"].isin(teams_of_interest).head()
0 True
1 False
2 True
3 True
4 False
Name: TEAM, dtype: bool
# Box plots: set the plot background style and create the figure.
sns.set_style("whitegrid")
plt.figure(figsize=(20, 10))
# Keep only the rows of the selected teams.
selected_teams = ['GS', 'CLE', 'SA', 'LAC', 'OKC', 'UTAH', 'CHA', 'TOR', 'NO', 'BOS']
data_team2 = data[data.TEAM.isin(selected_teams)]
# One subplot row per metric: salary, age, minutes per game.
for row, metric in enumerate(["SALARY_MILLIONS", "AGE", "MPG"], start=1):
    plt.subplot(3, 1, row)
    sns.boxplot(x="TEAM", y=metric, data=data_team2)
# Violin plots: set the plot background style and create the figure.
sns.set_style("whitegrid")
plt.figure(figsize=(20, 10))
# One subplot row per metric: three-point percentage, effective field-goal
# percentage, points.
for row, metric in enumerate(["3P%", "eFG%", "POINTS"], start=1):
    plt.subplot(3, 1, row)
    sns.violinplot(x="TEAM", y=metric, data=data_team2)