#-----------------------------------------------------------------------
#                                            Prof. Dr. Walmes M. Zeviani
#                                leg.ufpr.br/~walmes · github.com/walmes
#                                        walmes@ufpr.br · @walmeszeviani
#                      Laboratory of Statistics and Geoinformation (LEG)
#                Department of Statistics · Federal University of Paraná
#                                       2019-mai-25 · Curitiba/PR/Brazil
#-----------------------------------------------------------------------

#-----------------------------------------------------------------------
# Importação com o Python.

# Import the NumPy and pandas modules.
import numpy as np
import pandas as pd

# 1. Import the data file available at
#    <http://leg.ufpr.br/~walmes/data/euro_football_players.txt>.

# The remote copy can be read instead by pointing `url` at it:
# url = "http://leg.ufpr.br/~walmes/data/euro_football_players.txt"
url = "./data/euro_football_players.txt"

# Tab-separated file; text following a "#" is ignored as a comment.
tb = pd.read_csv(url, sep="\t", comment="#")
tb.info()

# 2. Number of records per country (`country`), largest count first.
(tb.groupby(['country'])
   .size()
   .sort_values(ascending=False)
   .reset_index())

# 3. Number of records per team (`team`), largest count first.
(tb.groupby(['team'])
   .size()
   .sort_values(ascending=False)
   .reset_index())

# 4. Overall mean age (`age`) of the players. The four spellings below
#    select the same column; the last one selects it with a list, so
#    the mean comes back as a one-element Series rather than a scalar.
tb.age.mean()
tb['age'].mean()
tb.loc[:, 'age'].mean()
tb.loc[:, ['age']].mean()

# 5. Overall median weight (`kg`) of the players.
tb.kg.median()

# 6. 10th and 90th percentiles of the height (`cm`) distribution.
tb.cm.quantile(q=[0.10, 0.90])

# 7. Correlation matrix between weight and height.
tb.loc[:, ['kg', 'cm']].corr()

# 8. Row of the player with the most goals (`goal`), transposed so the
#    fields are listed vertically.
tb.sort_values(by='goal', ascending=False).head(1).T

# 9. Row of the tallest player (`cm`), transposed likewise.
tb.sort_values(by='cm', ascending=False).head(1).T

# 10. The 10 players with the highest rating (`rt`).
tb.sort_values(by='rt', ascending=False).head(10)

# 11. The 10 tallest players (`cm`).
tb.sort_values(by='cm', ascending=False).head(10)

# 12. Keep only the players who took the field (`apps` > 0).
#     NOTE(review): the filter below keeps the rows whose `apps` is not
#     missing; presumably players without appearances are recorded as
#     missing in this data set -- confirm against the file.
tb['apps'].count()         # Non-missing values.
tb['apps'].isna().sum()    # Missing values.
tb['apps'].notna().sum()   # Non-missing values, counted explicitly.

tb2 = tb.loc[tb['apps'].notna()]
tb2.shape

# 13. Replace the missing values with 0 in yellow cards (`yel`), red
#     cards (`red`), goals (`goal`) and goal assists (`ass`).
v = ['yel', 'red', 'goal', 'ass']
tb[v].count()                          # Before.
tb.loc[:, v] = tb[v].fillna(value=0)   # Fill.
tb[v].count()                          # After.

# 14. Create a variable with the age group (`age`) in 5-year bins.
#     Inspect the observed range of ages before choosing the edges.
tb.age.min()
tb.age.max()
# With the default `right=True` each bin is closed on the right:
# (15, 20], (20, 25], ..., (40, 45].
tb['faixaet'] = pd.cut(x=tb.age,
                       bins=[15, 20, 25, 30, 35, 40, 45])

# NumPy can generate the same regular sequence of bin edges. `stop` is
# exclusive, so it has to lie beyond the last edge (45) for the result
# to be 15, 20, ..., 45.
np.arange(start=15, stop=50, step=5)

# 15. Number of players in each age group, not sorted by frequency.
tb.faixaet.value_counts(sort=False)

# 16. Create the body mass index variable: weight in kg divided by the
#     square of the height in metres (`cm` / 100).
tb['bmi'] = tb['kg'] / (tb['cm'] / 100) ** 2
tb['bmi'].mean()

# 17. Create the body mass index classes.
bns = [0, 18.5, 25, 30, 35, 40, np.inf]
lbs = ['Abaixo', 'Normal', 'Acima',
       'Obesidade I', 'Obesidade II', 'Obesidade III']

# Test vector with values at and around the class edges, to check on
# which side of each bin an edge value falls when `right=False`.
u = pd.Series([18.4999, 18.5, 30, 50])
pd.cut(x=u, bins=bns, labels=lbs, right=False, include_lowest=True)

tb['bmi_cls'] = pd.cut(x=tb['bmi'], bins=bns, labels=lbs,
                       right=False, include_lowest=True)

# Frequency table of the classes. The rename mapping is built from the
# current column labels: an 'index' column, when present, takes the
# label of the second column, and the second column becomes 'freq'.
u = tb['bmi_cls'].value_counts(sort=False).reset_index()
u = u.rename(columns={'index': u.columns[1], u.columns[1]: 'freq'})
u['freqrel'] = u['freq'] / u['freq'].sum()
u

# Relative frequencies computed directly by `value_counts`.
tb['bmi_cls'].value_counts(sort=False, normalize=True).reset_index()

# 18. Mean age per team. The exercise asks for teams, so the grouping
#     key is `team`. Four spellings of the same aggregation: the first
#     returns a Series, the next two a one-column DataFrame, and the
#     last a DataFrame with two-level column labels.
tb.groupby(['team']).age.mean()
tb.groupby(['team']).aggregate({"age": "mean"})
tb.groupby(['team']).agg({"age": "mean"})
tb.groupby(['team']).agg({"age": ["mean"]})

# 19. Median weight per country.
tb.groupby(['country']).kg.median()

# 20. Mean height per playing position.
tb.pos.unique()
tb.groupby(['pos']).cm.mean()

# 21. Mean rating per team.
tb.groupby(['team']).rt.mean()

# 22. Player with the most goals in each team. The rows are sorted by
#     goals in descending order, so the first row of each `team` group
#     is that team's top scorer.
tb.sort_values(by='goal', ascending=False).\
    groupby(['team']).head(1)[['name', 'team', 'goal']]

# 23. Proportion of Italian players per team: number of rows whose
#     `country` is "Italy" divided by the number of rows of the team.
tb.groupby(['team']).\
    agg({"country": lambda x: x.isin(["Italy"]).sum()/x.size})

# The same information as counts: teams in the rows, Italian or not in
# the columns. The full team-by-country table would be:
# pd.crosstab(index = tb.team, columns = tb.country)
pd.crosstab(index=tb.team,
            columns=tb.country.isin(['Italy']))

# 24. Mean, standard deviation and range (maximum minus minimum) of
#     the number of goals per team.
# NOTE: this helper shadows the builtin `range` for the rest of the
# module; the name is kept because the aggregations below refer to it.
def range(x):
    """Return the spread of `x`: its maximum minus its minimum."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

# Per-team mean, standard deviation and range of goals. Passing a list
# of aggregations yields two-level column labels (variable, statistic).
u = tb.groupby(['team']).\
    agg({"goal": ["mean", "std", range]})
u

# Inspect the two levels of the column labels.
u.columns
u.columns.get_level_values(0)
u.columns.get_level_values(1)

# Flatten the labels to single strings such as "goal_mean". Iterating
# the column index yields one tuple of level values per column, which
# avoids the deprecated `.ravel()` call on the index.
u.columns = ["_".join(x) for x in u.columns]
u.reset_index(inplace=True)
u

# 25. Per team: total goals, mean height and range of `bmi`.
u = tb.groupby(['team']).\
    agg({"goal": ["sum"],
         "cm": ["mean"],
         "bmi": [range]})
u.columns = ["_".join(x) for x in u.columns]
u.reset_index(inplace=True)
u

#-----------------------------------------------------------------------
# Extra: a second name, a deep copy or a shallow copy?

tb2 = tb                     # Plain assignment: another name for `tb`.
tb3 = tb.copy()              # Deep copy (`deep=True` is the default).
tb4 = tb.copy(deep=False)    # Shallow copy.

# Object identities, to see which names refer to the same object.
id(tb)
id(tb2)
id(tb3)
id(tb4)

# Change one cell through `tb`, then read that cell through each name.
# NOTE(review): whether the shallow copy shows the new value may depend
# on the pandas version and its copy-on-write setting -- confirm.
tb.at[0, 'age'] = 40
tb2.at[0, 'age']
tb3.at[0, 'age']  # deep=True (default).
tb4.at[0, 'age']  # deep=False.

tb2.head(3)
tb3.head(3)

#-----------------------------------------------------------------------