import pandas as pd
import seaborn as sns
from statsmodels.stats.proportion import proportions_ztest
import matplotlib.pyplot as plt

# Read the four CSV inputs. economy.csv is parsed with low_memory=False so
# pandas infers each column's dtype from the whole file rather than per chunk.
economy_df = pd.read_csv("economy.csv", low_memory=False)
results_df, players_df, picks_df = (
    pd.read_csv(csv_name)
    for csv_name in ("results.csv", "players.csv", "picks.csv")
)

# Columns of players.csv kept for the analysis: identifying fields, overall
# stat columns, then the same six stat columns with a _ct and a _t suffix.
essential_columns = [
    'date', 'player_name', 'team', 'opponent', 'country',
    'player_id', 'match_id', 'event_id', 'event_name', 'best_of',
    'kills', 'assists', 'deaths', 'hs', 'flash_assists',
    'kast', 'kddiff', 'adr', 'fkdiff', 'rating',
    *(
        f'{stat}_{side}'
        for side in ('ct', 't')
        for stat in ('kills', 'deaths', 'kddiff', 'adr', 'kast', 'rating')
    ),
]

# Work on an independent copy restricted to those columns.
players_df = players_df.loc[:, essential_columns].copy()
display(players_df)

# Columns of picks.csv kept for the analysis: match identification, the three
# t1_removed_* / t2_removed_* columns, the two *_picked_1 columns and left_over.
essential_columns = [
    'date', 'team_1', 'team_2', 'match_id', 'event_id', 'best_of',
    *(f't{team}_removed_{slot}' for team in (1, 2) for slot in (1, 2, 3)),
    't1_picked_1', 't2_picked_1', 'left_over',
]

# Restrict to those columns and parse the dates; values that cannot be parsed
# become NaT because of errors='coerce'.
picks_df = picks_df.loc[:, essential_columns].assign(
    date=lambda frame: pd.to_datetime(frame['date'], errors='coerce')
)
display(picks_df)

# Columns of results.csv kept for the analysis: date, teams and map, the four
# ct_* / t_* columns, then the winner / starting-side fields and the ids.
essential_columns = (
    ['date', 'team_1', 'team_2', '_map']
    + ['ct_1', 't_1', 'ct_2', 't_2']
    + ['map_winner', 'starting_ct']
    + ['match_winner', 'event_id', 'match_id']
)

# Work on an independent copy restricted to those columns.
results_df = results_df.loc[:, essential_columns].copy()
display(results_df)

# One-sample proportion z-test, run separately for each map
# Null Hypothesis: CT win rate = 50%
# Alternative Hypothesis: CT win rate ≠ 50%
# Also builds the rolling CT win rate per map used by the time-series graph
# (centered rolling window of 500 rows of the stacked ct/t frames, at least 20 rows)
results_df['date'] = pd.to_datetime(results_df['date'], errors='coerce')
results_df = results_df.dropna(subset=['date', '_map'])

# Stack the team-1 and team-2 columns so each side ends up in one long column.
ct_1 = results_df[['date', '_map', 'ct_1']].rename(columns={'ct_1': 'ct'})
ct_2 = results_df[['date', '_map', 'ct_2']].rename(columns={'ct_2': 'ct'})
t_1 = results_df[['date', '_map', 't_1']].rename(columns={'t_1': 't'})
t_2 = results_df[['date', '_map', 't_2']].rename(columns={'t_2': 't'})

ct = pd.concat([ct_1, ct_2]).sort_values('date').set_index('date')
t = pd.concat([t_1, t_2]).sort_values('date').set_index('date')

maps = ['Cache', 'Cobblestone', 'Dust2', 'Inferno', 'Mirage', 'Nuke', 'Overpass', 'Train', 'Vertigo']

rolling_ct_avg = {}
hypothesis_results = []

for map_name in maps:
    ct_map = ct[ct['_map'] == map_name]
    t_map = t[t['_map'] == map_name]

    # Rolling sums (not means): the ratio below is then wins / total wins
    # inside each window. The *_avg names are kept from the original cell.
    ct_avg = ct_map['ct'].rolling(window=500, min_periods=20, center=True).sum()
    t_avg = t_map['t'].rolling(window=500, min_periods=20, center=True).sum()
    win_pct = (ct_avg / (ct_avg + t_avg)) * 100
    rolling_ct_avg[map_name] = win_pct

    ct_total = ct_map['ct'].sum()
    t_total = t_map['t'].sum()

    # CT wins and T wins are complementary counts of the same rounds, not two
    # independent samples, so a two-sample test would overstate the statistic.
    # Test the CT share against 0.5 directly; prop_var=0.5 makes the standard
    # error use the proportion under the null hypothesis.
    z_stat, p_val = proportions_ztest(
        ct_total, ct_total + t_total, value=0.5, prop_var=0.5
    )

    hypothesis_results.append({
        'Map': map_name,
        'CT Round Wins': ct_total,
        'T Round Wins': t_total,
        'CT Round Win Rate Percentage': round((ct_total / (ct_total + t_total)) * 100, 2),
        'Z-Statistic': round(z_stat, 3),
        'p-value': p_val,
        'Significant? ': 'Yes' if p_val < 0.05 else 'No'
    })

hypothesis_df = pd.DataFrame(hypothesis_results).sort_values('CT Round Win Rate Percentage', ascending=False)
display(hypothesis_df)

# Line chart of the rolling CT round win percentage, one line per map.
plt.figure(figsize=(14, 7))

for map_name in rolling_ct_avg:
    win_pct_data = rolling_ct_avg[map_name]
    plt.plot(
        win_pct_data.index,
        win_pct_data.values,
        label=map_name,
        linewidth=1.5,
        alpha=0.9,
    )

# Dashed reference line at the 50% mark.
plt.axhline(50, color='black', linestyle='--', linewidth=1)
plt.gca().set(
    xlabel='Date',
    ylabel='CT Round Win Percentage',
    title='CT Round Win Percentage Over Time by Map',
)
# Anchor the legend just outside the right edge of the axes.
plt.legend(title='Map', loc='center left', bbox_to_anchor=(1, 0.5))
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

# Per map: how often does the side recorded in starting_ct equal map_winner?
# Each row of results_df counts as one observation.
results_df['ct_start_win'] = results_df['starting_ct'] == results_df['map_winner']

rolling_window = 1500
plt.figure(figsize=(14, 7))

match_results = []

for map_name in maps:
    map_df = results_df[results_df['_map'] == map_name].sort_values('date')
    ct_wins = map_df['ct_start_win'].sum()
    total = len(map_df)

    # One-sample z-test of the CT-start win share against 50%. Wins and losses
    # are complementary outcomes of the same rows, so they must not be passed
    # as two independent samples. prop_var=0.5 uses the null-hypothesis
    # proportion for the standard error.
    z_stat, p_val = proportions_ztest(ct_wins, total, value=0.5, prop_var=0.5)

    # Centered rolling mean of the boolean outcome = rolling win share.
    win_rate = map_df['ct_start_win'].rolling(window=rolling_window, min_periods=20, center=True).mean()

    match_results.append({
        'Map': map_name,
        'CT Start Wins': ct_wins,
        'Total Matches': total,
        'CT Start Win Rate Percentage': round((ct_wins / total) * 100, 2),
        # Report the test result instead of computing and discarding it.
        'Z-Statistic': round(z_stat, 3),
        'p-value': p_val,
        'Significant?': 'Yes' if p_val < 0.05 else 'No',
    })

    plt.plot(map_df['date'], win_rate * 100, label=map_name, alpha=0.9)

ct_match_df = pd.DataFrame(match_results).sort_values('CT Start Win Rate Percentage', ascending=False)
display(ct_match_df)

# Dashed reference line at the 50% mark.
plt.axhline(50, color='gray', linestyle='--')
plt.xlabel("Date")
plt.ylabel("CT-starting Side Team Win Rate (%)")
plt.title(f"CT-Starting Side Team Match Win Rate (Rolling {rolling_window} Matches)")
plt.legend(title='Map', loc='center left', bbox_to_anchor=(1, 0.5))
plt.grid(True)
plt.tight_layout()
plt.show()

# Scatter plot of every players_df row: adr on the x-axis, rating on the y-axis.
# Small, semi-transparent markers keep the dense point cloud readable.
plt.figure(figsize=(10, 6))
plt.scatter(
    players_df['adr'],
    players_df['rating'],
    alpha=0.6,
    s=0.3,
)
plt.gca().set(
    title="Pro Player ADR vs Rating",
    xlabel="ADR (Average Damage per Round)",
    ylabel="Rating",
)
plt.grid(True)

plt.show()

# Series.corr defaults to the Pearson coefficient.
correlation = players_df['adr'].corr(players_df['rating'])

display(f"Pearson correlation of ADR vs Rating: {correlation:.3f}")

'Pearson correlation of ADR vs Rating: 0.880'

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, confusion_matrix
import numpy as np

# Features: every picks_df column except the target (left_over) and the
# date / match_id / event_id columns.
X = picks_df.drop(columns=['left_over', 'date', 'match_id', 'event_id'])

# one hot encoding
X = pd.get_dummies(X)

# Target: the left_over column.
y = picks_df['left_over']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Seed the forest as well as the split; without random_state the accuracy and
# the feature importances change on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# get the top feature importances (argsort is ascending, so the last 10
# indices are the 10 largest, smallest of them first)
importances = model.feature_importances_
indices = np.argsort(importances)[-10:]
top_feats = X.columns[indices]
top_vals = importances[indices]

# plot it
plt.figure(figsize=(10, 6))
plt.barh(top_feats, top_vals)
plt.xlabel("Feature Importance")
plt.title("Top 10 Feature Importances")
plt.tight_layout()
plt.show()

# show accuracy of the model on the held-out 25% test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Random forest accuracy: {accuracy:.2%}")

Random forest accuracy: 74.31%

	date	player_name	team	opponent	country	player_id	match_id	event_id	event_name	best_of	...	kddiff_ct	adr_ct	kast_ct	rating_ct	kills_t	deaths_t	kddiff_t	adr_t	kast_t	rating_t
0	2020-02-26	Brehze	Evil Geniuses	Liquid	United States	9136	2339385	4901	IEM Katowice 2020	3	...	4.0	81.6	79.2	1.10	23.0	31.0	-8.0	77.5	60.0	0.97
1	2020-02-26	CeRq	Evil Geniuses	Liquid	Bulgaria	11219	2339385	4901	IEM Katowice 2020	3	...	12.0	77.4	72.9	1.16	17.0	29.0	-12.0	63.9	54.3	0.73
2	2020-02-26	EliGE	Liquid	Evil Geniuses	United States	8738	2339385	4901	IEM Katowice 2020	3	...	14.0	96.6	71.4	1.39	24.0	34.0	-10.0	64.2	64.6	0.86
3	2020-02-26	Ethan	Evil Geniuses	Liquid	United States	10671	2339385	4901	IEM Katowice 2020	3	...	10.0	74.0	75.0	1.11	10.0	31.0	-21.0	37.8	51.4	0.43
4	2020-02-26	NAF	Liquid	Evil Geniuses	Canada	8520	2339385	4901	IEM Katowice 2020	3	...	11.0	96.3	85.7	1.36	24.0	29.0	-5.0	61.0	70.8	0.87
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
383312	2015-10-07	kIMERA	ExAequo	RIP Fonty	Italy	7607	2298497	1957	Milan Games Week 2015 League by FACEIT	2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
383313	2015-10-07	morphiw0w	ExAequo	RIP Fonty	Italy	9752	2298497	1957	Milan Games Week 2015 League by FACEIT	2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
383314	2015-10-07	overfly	RIP Fonty	ExAequo	Italy	7698	2298497	1957	Milan Games Week 2015 League by FACEIT	2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
383315	2015-10-07	simozor	RIP Fonty	ExAequo	Italy	9753	2298497	1957	Milan Games Week 2015 League by FACEIT	2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
383316	2015-10-07	xullE	RIP Fonty	ExAequo	Italy	9754	2298497	1957	Milan Games Week 2015 League by FACEIT	2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	date	team_1	team_2	_map	ct_1	t_1	ct_2	t_2	map_winner	starting_ct	match_winner	event_id	match_id
0	2020-03-18	Recon 5	TeamOne	Dust2	0	0	15	1	2	2	2	5151	2340454
1	2020-03-18	Recon 5	TeamOne	Inferno	8	5	10	6	2	2	2	5151	2340454
2	2020-03-18	New England Whalers	Station7	Inferno	9	3	10	6	2	1	2	5243	2340461
3	2020-03-18	Rugratz	Bad News Bears	Inferno	0	7	8	8	2	2	2	5151	2340453
4	2020-03-18	Rugratz	Bad News Bears	Vertigo	4	4	11	5	2	2	2	5151	2340453
...	...	...	...	...	...	...	...	...	...	...	...	...	...
45768	2015-11-05	G2	E-frag.net	Inferno	8	5	9	7	2	1	2	1970	2299059
45769	2015-11-05	G2	E-frag.net	Dust2	10	6	8	5	1	1	2	1970	2299059
45770	2015-11-04	CLG	Liquid	Inferno	7	9	4	8	1	1	1	1934	2299011
45771	2015-11-03	NiP	Dignitas	Train	4	12	3	1	1	2	1	1934	2299001
45772	2015-11-03	NiP	Envy	Cobblestone	4	12	3	6	1	2	1	1934	2299003

	Map	CT Round Wins	T Round Wins	CT Round Win Rate Percentage	Z-Statistic	p-value	Significant?
5	Nuke	58561	48102	54.90	45.290	0.000000e+00	Yes
7	Train	90147	76095	54.23	48.740	0.000000e+00	Yes
6	Overpass	75508	67366	52.85	30.463	8.113994e-204	Yes
4	Mirage	119334	111006	51.81	24.540	5.557588e-133	Yes
3	Inferno	93465	96862	49.11	-11.012	3.350468e-28	Yes
1	Cobblestone	43390	45332	48.91	-9.220	2.960643e-20	Yes
2	Dust2	50899	54195	48.43	-14.378	7.061988e-47	Yes
8	Vertigo	7559	8130	48.18	-6.447	1.141327e-10	Yes
0	Cache	55647	61750	47.40	-25.190	5.142636e-140	Yes

	Map	CT Start Wins	Total Matches	CT Start Win Rate Percentage
1	Cobblestone	1828	3513	52.04
3	Inferno	3864	7485	51.62
0	Cache	2361	4613	51.18
8	Vertigo	310	609	50.90
2	Dust2	2093	4114	50.88
4	Mirage	4585	9021	50.83
5	Nuke	2114	4206	50.26
7	Train	3280	6566	49.95
6	Overpass	2786	5625	49.53

An analysis of Counter-Strike: Global Offensive pro matches from 2015-2020 ("The Golden Age")

Summer 2025 Data Science Project

Introduction

Basic overview of CS:GO

Our analysis

Data Preprocessing

C1: CT vs. T Round Win Rate By Map

C2: Has starting side ever mattered for match wins?

C3: Graphing Pro Player ADR vs Rating

Predicting Map Picks with Machine Learning

Conclusion

Key Findings

Implications for the CS:GO Community

Limitations and Future Work