PANDAS

DEMO

data import

NORSEMEN.csv

ID,NAME,AGE
001,Bjorn,21
002,Knut,26
003,Ragnar,24
004,Leif,32
005,Frida,22

NORSEMEN2.csv

ID,NAME,AGE
1,Bjorn,21
2,Knut,26
3,Ragnar,24
5,Frida,22

NORSEGODS.csv

IDS,NAME,MISC
101,Odin,Allfather
102,Thor,God of thunder
103,Tyr,God of war
105,Freya,Goddess of love

code

#!/usr/bin/env python3
import pandas as pd

df_norsemen = pd.read_csv("NORSEMEN.csv", dtype={"ID" : str})    # dtype=str keeps the leading zeros in ID
df_norsegods = pd.read_csv("NORSEGODS.csv", dtype={"IDS" : str})

# give both frames the same columns and column order before concatenating
# (pd.concat aligns on column names anyway, filling gaps with NaN)
df_norsegods['AGE'] = None
df_norsegods.rename(columns={'IDS' : 'ID'}, inplace=True)

df_norsemen['MISC'] = None
df_norsemen = df_norsemen[['ID', 'NAME', 'MISC', 'AGE']]


df_merged = pd.concat([df_norsemen, df_norsegods], ignore_index=True)
df_merged.to_csv("merged.csv", index=False)


# get names
names = list(df_merged["NAME"])
assert(type(names) == list)

namestoo = df_merged.iloc[:, 1]   # column 1 is "NAME" (column 0 is "ID")
assert(type(namestoo) == pd.core.series.Series)
# TODO : how to manipulate Series in pandas
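# a minimal sketch of common Series operations (illustrative only) :
names_upper = namestoo.str.upper()      # vectorised string method
name_lengths = namestoo.str.len()       # element-wise length
first_two = namestoo.head(2)            # first two entries
back_to_list = namestoo.tolist()        # convert back to a plain Python list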



# at syntax
assert(df_merged.at[1, "NAME"] == 'Knut')



# keep rows matching a condition (rows whose AGE is missing don't match)
df2 = df_merged.loc[df_merged['AGE'] > 22]

# keep only rows with empty "AGE"
df3 = df_merged.loc[df_merged['AGE'].isna()]
# same as : df3 = df_merged.loc[df_merged['AGE'].isnull()]

# get the first 3 rows
df4 = df_merged.iloc[:3]

# make new df with only columns ID and NAME
df5 = df_merged[['ID', 'NAME']]


# get rows that are in dfA but not in dfB :
dfA = pd.read_csv("NORSEMEN.csv", sep=',')
dfB = pd.read_csv("NORSEMEN2.csv", sep=',')

merged = dfA.merge(dfB, how='left', indicator=True)
only_in_dfA = merged[merged['_merge']=='left_only'].drop('_merge', axis=1)
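# sketch of the symmetric case : rows in dfB but not in dfA (same indicator trick, outer merge keeps both sides)
# (empty here, since NORSEMEN2.csv is a subset of NORSEMEN.csv)
merged_outer = dfA.merge(dfB, how='outer', indicator=True)
only_in_dfB = merged_outer[merged_outer['_merge'] == 'right_only'].drop('_merge', axis=1)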

# left join : see the LEFT JOIN demo further down

left join driven by a config file

cfg.ini

[INPUT]
filename1 = FIRSTNAMES.csv
filename2 = LASTNAMES.csv

[COLNAMES]
left=IDS
right=IDS
drop=HEIGHT,AGE

main.py

import pandas 
import configparser

config = configparser.ConfigParser()
assert(config.read('cfg.ini') != [] )

file1 = config['INPUT']['filename1']
file2 = config['INPUT']['filename2']

join_on_file1 = config['COLNAMES']['left']
join_on_file2 = config['COLNAMES']['right']

df1 = pandas.read_csv(file1, sep=';')
df2 = pandas.read_csv(file2, sep=';')

result = pandas.merge(df1, df2, left_on=join_on_file1, right_on=join_on_file2, how='left').drop_duplicates()

columns_to_drop = config['COLNAMES']['drop'].split(',')

# drop only the columns that are actually present in the result
result = result.drop(columns=[c for c in columns_to_drop if c in result.columns])

result.to_csv('out.csv', index=False)

LEFT JOIN

# left join demo python

import pandas

data1 = {
        'IDS' : [0, 1, 2, 3],
        'NAMES' : ['Ali', 'Dean', 'Paul', 'Jack'],
        'AGES' : [18, 22, None, 33]
}
df1 = pandas.DataFrame(data1)


data2 = {
        'ID' : [3, 1, 2],
        'JOBS' : ['plumber', 'artist', 'poet']
}
df2 = pandas.DataFrame(data2)


result = pandas.merge(df1, df2, left_on='IDS', right_on="ID", how="left").drop_duplicates()
result.drop(columns=['ID'], inplace=True) # or result = result.drop('ID', axis=1)

# Ali (IDS 0) has no match in df2, so his JOBS is NaN ;
# AGES prints as float (18.0, 22.0, NaN, 33.0) because of the missing value
print(result)

float -> fillna -> int

# fillna() is what makes this work : NaN cannot be cast to int, so fill first, then convert
df['ID'] = df['ID'].fillna(0).astype(int)
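# alternative sketch (pandas >= 1.0) : instead of the fillna above, the nullable Int64
# dtype keeps the NA as <NA> (works when the non-missing values are whole numbers)
df['ID'] = df['ID'].astype('Int64')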

basic stuff

# number of rows in df
print(len(df))

# number of columns in df
print(len(df.columns))

# careful : this drops a ROW (the one with index label 3), not a column
df = df.drop(index=3)

# careful with this one : df.index[2] is the LABEL of the row at position 2,
# which is not necessarily 2 itself (e.g. after filtering or sorting)
df = df.drop(df.index[2])

print(df["COLNAME"].dtype)

assert(df_cible["ID"].is_unique)   # sanity check : no duplicate IDs

work with csv

df = pd.read_csv("input.csv", sep=";", dtype={"COLNAME" : "string"})
df.to_csv("output.csv", index=False, sep=";")

type conversions

df["COLNAME"] = df["COLNAME"].str.replace(",", ".").astype(float).astype(int)


# NA -> 0 (so the int cast works) -> back to NA
# careful : any genuine 0 in the column also becomes NA
df["COLNAME"] = df["COLNAME"].fillna(0).astype(int)
df["COLNAME"] = df["COLNAME"].replace(0, pd.NA)


def import_df(name):
	df = pd.read_csv(name, sep=";", dtype={"A" : "string"})
	df["B"] = df["B"].str.replace(",", ".").astype(float).astype(int)
	df["DATE1"] = pd.to_datetime(df["DATE1"])
	df["AFLOAT"] = df["AFLOAT"].fillna(0).astype(int) # float -> int
	df["AFLOAT"] = df["AFLOAT"].replace(0, pd.NA) # put back NA instead of 0
	
	return df

dates

df["DATE"] = pd.to_datetime(df["DATE"])

# convert col to datetime
df_reduced.loc[:, "DATE"] = pd.to_datetime(
df_reduced["DATE"], format="%d/%m/%y %H:%M", errors="coerce")

filters

df.loc[df["DIFF"] == 0]

# not a filter : this rewrites the matching values in place
df.loc[df['COLNAME']=='OLD_VALUE', 'COLNAME'] = 'RENAMED_VALUE'


# filter by month
df.loc[pd.to_datetime(df['DATE']).dt.month == month]


# calculate DATE1 - DATE2 into a new column (both must be datetime ; result is a Timedelta)
df_v2.loc[:, "time"] = df_v2["DATE1"] - df_v2["DATE2"]


# exclude rows whose ID is "foo" or "bar"
df = df.loc[~df["ID"].isin(["foo", "bar"])]

df = df.sort_values(by=['COL1', 'COL2'])

df = df.drop_duplicates(subset=['COL1'], keep='last')

functions


def extract_period(df, date_start, date_end):
	date_start = pd.to_datetime(date_start, format="%d/%m/%Y")
	date_end = pd.to_datetime(date_end, format="%d/%m/%Y")
	df_filtered = df[(df["DATE"] >= date_start) & (df["DATE"] <= date_end)]
	
	return df_filtered
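
# hypothetical usage, assuming df already has a datetime "DATE" column :
df_march = extract_period(df, "01/03/2024", "31/03/2024")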


def sum_col(df):
	col = df["COLNAME"]
	col = col.str.replace(",", ".").astype(float)

	return col.sum()   # Series.sum() skips NaN, unlike the builtin sum()

plot

import matplotlib.pyplot as plt

# value_counts().reset_index() yields columns "States" and "count" (pandas >= 2.0 naming)
states = df_v2['States'].value_counts().reset_index()

plt.bar(states['States'], states['count'], color=['blue', 'green', 'red', 'lightblue'], width=0.4)

plt.xlabel("XNAME")
plt.ylabel("YNAME")
plt.title("GRAPH NAME")

plt.show()
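
# optional : plt.savefig("states.png", dpi=150) writes the figure to a file
# (call it before plt.show(), otherwise the figure may already be closed)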