PANDAS
DEMO
data import
NORSEMEN.csv
ID,NAME,AGE
001,Bjorn,21
002,Knut,26
003,Ragnar,24
004,Leif,32
005,Frida,22
NORSEMEN2.csv
ID,NAME,AGE
1,Bjorn,21
2,Knut,26
3,Ragnar,24
5,Frida,22
NORSEGODS.csv
IDS,NAME,MISC
101,Odin,Allfather
102,Thor,God of thunder
103,Tyr,God of war
105,Freya,Goddess of love
code
#!/usr/bin/env python3
import pandas as pd
df_norsemen = pd.read_csv("NORSEMEN.csv", dtype={"ID" : str})
df_norsegods = pd.read_csv("NORSEGODS.csv", dtype={"IDS" : str})
df_norsegods['AGE'] = float("nan")   # NaN (not None) keeps AGE numeric after the concat
df_norsegods.rename(columns={'IDS' : 'ID'}, inplace=True)
df_norsemen['MISC'] = None
df_norsemen = df_norsemen[['ID', 'NAME', 'MISC', 'AGE']]   # align column order before concat
df_merged = pd.concat([df_norsemen, df_norsegods], ignore_index=True)
df_merged.to_csv("merged.csv", index=False)
# get the names as a plain list
names = list(df_merged["NAME"])
assert isinstance(names, list)
# ...or as a Series (column 1 is NAME; column 0 would be ID)
namestoo = df_merged.iloc[:, 1]
assert isinstance(namestoo, pd.Series)
# TODO : how to manipulate Series in pandas
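# a few common Series operations (sketch, using the columns from this demo):
upper_names = namestoo.str.upper()             # vectorized string method
name_lengths = namestoo.map(len)               # apply a function element-wise
age_counts = df_merged["AGE"].value_counts()   # counts per distinct value
back_to_list = namestoo.tolist()               # Series -> plain Python list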
# at syntax
assert(df_merged.at[1, "NAME"] == 'Knut')
# keep rows with specific conditions
df2 = df_merged.loc[df_merged['AGE'] > 22]
# keep only rows with empty "AGE"
df3 = df_merged.loc[df_merged['AGE'].isna()]
# same as : df3 = df_merged.loc[df_merged['AGE'].isnull()]
# get the first 3 rows
df4 = df_merged.iloc[:3]
# make new df with only columns ID and NAME
df5 = df_merged[['ID', 'NAME']]
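# combining conditions (sketch): wrap each comparison in parentheses with & / |
df6 = df_merged.loc[(df_merged['AGE'] > 22) & (df_merged['MISC'].isna())]
df7 = df_merged.loc[(df_merged['NAME'] == 'Knut') | (df_merged['NAME'] == 'Frida')]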
# get rows in dfA but not in dfB (anti join via the merge indicator):
dfA = pd.read_csv("NORSEMEN.csv", sep=',')
dfB = pd.read_csv("NORSEMEN2.csv", sep=',')
merged = dfA.merge(dfB, how='left', indicator=True)
only_in_dfA = merged[merged['_merge']=='left_only'].drop('_merge', axis=1)
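# the same indicator column covers the other cases too (sketch):
outer = dfA.merge(dfB, how='outer', indicator=True)
in_both = outer[outer['_merge'] == 'both'].drop('_merge', axis=1)
only_in_dfB = outer[outer['_merge'] == 'right_only'].drop('_merge', axis=1)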
# TODO : left join
left join driven by a config file
cfg.ini
[INPUT]
filename1 = FIRSTNAMES.csv
filename2 = LASTNAMES.csv
[COLNAMES]
left=IDS
right=IDS
drop=HEIGHT,AGE
main.py
import pandas
import configparser
config = configparser.ConfigParser()
assert config.read('cfg.ini'), "cfg.ini not found or empty"
file1 = config['INPUT']['filename1']
file2 = config['INPUT']['filename2']
join_on_file1 = config['COLNAMES']['left']
join_on_file2 = config['COLNAMES']['right']
df1 = pandas.read_csv(file1, sep=';')
df2 = pandas.read_csv(file2, sep=';')
result = pandas.merge(df1, df2, left_on=join_on_file1, right_on=join_on_file2, how='left').drop_duplicates()
columns_to_drop = [c.strip() for c in config['COLNAMES']['drop'].split(',')]
result = result.drop(columns=[c for c in columns_to_drop if c in result.columns])
result.to_csv('out.csv', index=False)
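Possible extension (sketch; the sep key is hypothetical and not in the cfg.ini above): read the CSV separator from the config instead of hard-coding ';'.
sep = config['INPUT'].get('sep', ';')   # hypothetical key, falls back to ';'
df1 = pandas.read_csv(file1, sep=sep)
df2 = pandas.read_csv(file2, sep=sep)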
LEFT JOIN
# left join demo python
import pandas
data1 = {
    'IDS'   : [0, 1, 2, 3],
    'NAMES' : ['Ali', 'Dean', 'Paul', 'Jack'],
    'AGES'  : [18, 22, None, 33]
}
df1 = pandas.DataFrame(data1)
data2 = {
    'ID'   : [3, 1, 2],
    'JOBS' : ['plumber', 'artist', 'poet']
}
df2 = pandas.DataFrame(data2)
result = pandas.merge(df1, df2, left_on='IDS', right_on="ID", how="left").drop_duplicates()
result.drop(columns=['ID'], inplace=True) # or result = result.drop('ID', axis=1)
print(result)
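For reference, the print above gives roughly this (Ali has no match, so JOBS is NaN; AGES becomes float because of the missing value):
#    IDS NAMES  AGES     JOBS
# 0    0   Ali  18.0      NaN
# 1    1  Dean  22.0   artist
# 2    2  Paul   NaN     poet
# 3    3  Jack  33.0  plumber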
float -> fill NA -> int
# astype(int) raises on NaN, so fill the missing values first (with 0 here)
df['ID'] = df['ID'].fillna(0).astype(int)
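Alternative sketch: pandas' nullable integer dtype keeps the missing values instead of turning them into 0.
df['ID'] = df['ID'].astype('Int64')   # capital I: nullable integer, NaN becomes <NA> (floats must be whole numbers)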
basic stuff
# number of rows in df
print(len(df))
# number of columns in df
print(len(df.columns))
# drop the row with index label 3 (note: this drops a row, not a column)
df = df.drop(index=3)
# drop the row at position 2: df.index[2] is its label, which can differ from the position after earlier drops
df = df.drop(df.index[2])
print(df["COLNAME"].dtype)
assert(df_cible["ID"].is_unique)
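A few more one-liners in the same spirit (sketch):
print(df.shape)      # (number of rows, number of columns)
print(df.dtypes)     # dtype of every column
print(df.head())     # first 5 rows
df.info()            # column names, non-null counts, dtypes, memory usage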
work with csv
df = pd.read_csv("input.csv", sep=";", dtype={"COLNAME" : "string"})
df.to_csv("output.csv", index=False, sep=";")
type conversions
df["COLNAME"] = df["COLNAME"].str.replace(",", ".").astype(float).astype(int)
# NAs -> 0 -> NaN
df["COLNAME"] = df["COLNAME"].fillna(0).astype(int)
df["COLNAME"] = df["COLNAME"].replace(0, pd.NA)
def import_df(name):
    df = pd.read_csv(name, sep=";", dtype={"A" : "string"})
    df["B"] = df["B"].str.replace(",", ".").astype(float).astype(int)
    df["DATE1"] = pd.to_datetime(df["DATE1"])
    df["AFLOAT"] = df["AFLOAT"].fillna(0).astype(int)   # float -> int
    df["AFLOAT"] = df["AFLOAT"].replace(0, pd.NA)       # put back NA instead of 0
    return df
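Alternative sketch for the comma-decimal step: pd.to_numeric with errors="coerce" turns unparsable values into NaN instead of raising.
df["B"] = pd.to_numeric(df["B"].str.replace(",", "."), errors="coerce")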
dates
df["DATE"] = pd.to_datetime(df["DATE"])
# convert col to datetime; errors="coerce" turns unparsable dates into NaT
df_reduced.loc[:, "DATE"] = pd.to_datetime(
    df_reduced["DATE"], format="%d/%m/%y %H:%M", errors="coerce")
filters
df.loc[df["DIFF"] == 0]
df.loc[df['COLNAME']=='OLD_VALUE', 'COLNAME'] = 'RENAMED_VALUE'
# filter by month
df.loc[pd.to_datetime(df['DATE']).dt.month == month]
# calculate DATE1 - DATE2 into a new column (gives a timedelta)
df.loc[:, "time"] = df["DATE1"] - df["DATE2"]
# exclude rows whose ID is "foo" or "bar"
df = df.loc[~df["ID"].isin(["foo", "bar"])]
df = df.sort_values(by=['COL1', 'COL2'])
df = df.drop_duplicates(subset=['COL1'], keep='last')   # keep only the last row per COL1 value
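One more common filter (sketch; TEXTCOL stands for any string column): keep rows containing a substring.
df = df[df["TEXTCOL"].str.contains("foo", na=False)]   # na=False drops rows where TEXTCOL is NaN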
functions
def extract_period(df, date_start, date_end):
    date_start = pd.to_datetime(date_start, format="%d/%m/%Y")
    date_end = pd.to_datetime(date_end, format="%d/%m/%Y")
    df_filtered = df[(df["DATE"] >= date_start) & (df["DATE"] <= date_end)]
    return df_filtered
def sum_col(df):
    col = df["COLNAME"]
    col = col.str.replace(",", ".").astype(float)
    return col.sum()   # Series.sum() skips NaN, unlike the built-in sum()
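Usage sketch for the two functions above (the DataFrame is assumed to have DATE and COLNAME columns):
df_period = extract_period(df, "01/01/2024", "31/01/2024")
print(len(df_period), sum_col(df_period))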
plot
import matplotlib.pyplot as plt
states = df_v2['States'].value_counts().reset_index()   # columns 'States' and 'count' (pandas >= 2.0)
plt.bar(states['States'], states['count'], color=['blue', 'green', 'red', 'lightblue'], width=0.4)
plt.xlabel("XNAME")
plt.ylabel("YNAME")
plt.title("GRAPH NAME")
plt.show()
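To keep the figure as a file (call it before plt.show(), which may clear the figure):
plt.savefig("graph.png", dpi=150, bbox_inches="tight")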