pandas
url_titanic = "https://gist.githubusercontent.com/butuzov/ed1c7f8f3affe6dd005c1ee40dc3f7f2/raw/87c7d72e009f1965263edc3431adbd4fab69f387/titanic.csv"
url_economics = "https://gist.githubusercontent.com/butuzov/ed1c7f8f3affe6dd005c1ee40dc3f7f2/raw/87c7d72e009f1965263edc3431adbd4fab69f387/economics.csv"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# nice logging helpers
from icecream import ic
import sys, re
def printError(e): print("Error: {}".format(e), file=sys.stderr)
def jupyter(*args):
    print(*[re.sub(r",\s{1,}", ", ", i.replace(",\n", ", ")) for i in args], file=sys.stdout)
ic.configureOutput(prefix='ic> ', outputFunction=jupyter)
rnd = np.random.RandomState(42)
Pandas
#Getting Started
pandas allows us to plot and explore data, and to perform math on large datasets.
#Versions and general overview
# version
ic(pd.__version__)
> ic> pd.__version__: '1.5.3'
result >>> '1.5.3'
economics = pd.read_csv(url_economics, index_col='date',parse_dates=True)
economics.head(2)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 1967-07-01 | 507.4 | 198712 | 12.5 | 4.5 | 2944 |
| 1967-08-01 | 510.5 | 198911 | 12.5 | 4.7 | 2945 |
#Options
https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html#available-options
ic(pd.describe_option("display.max_rows"))

with pd.option_context('display.max_rows', 2):
    ic(pd.get_option("display.max_rows"))
ic(pd.get_option("display.max_rows"))
print("----"*20)
ic(pd.get_option("display.max_rows"))
ic(pd.set_option("display.max_rows", 2))
ic(pd.get_option("display.max_rows"))
print("----"*20)
ic(pd.reset_option("display.max_rows"))
ic(pd.get_option("display.max_rows"))
> display.max_rows : int
> If max_rows is exceeded, switch to truncate view. Depending on
> `large_repr`, objects are either centrally truncated or printed as
> a summary view. 'None' value means unlimited.
>
> In case python/IPython is running in a terminal and `large_repr`
> equals 'truncate' this can be set to 0 and pandas will auto-detect
> the height of the terminal and print a truncated object which fits
> the screen height. The IPython notebook, IPython qtconsole, or
> IDLE do not run in a terminal and hence it is not possible to do
> correct auto-detection.
> [default: 60] [currently: 60]
> ic> pd.describe_option("display.max_rows"): None
> ic> pd.get_option("display.max_rows"): 2
> ic> pd.get_option("display.max_rows"): 60
> --------------------------------------------------------------------------------
> ic> pd.get_option("display.max_rows"): 60
> ic> pd.set_option("display.max_rows", 2): None
> ic> pd.get_option("display.max_rows"): 2
> --------------------------------------------------------------------------------
> ic> pd.reset_option("display.max_rows"): None
> ic> pd.get_option("display.max_rows"): 60
result >>> 60
#DataTypes: Series and DataFrames
Types:

- np.bool (bool) - stored as a single byte.
- np.int (int) - defaults to 64 bits; unsigned ints are also available (np.uint).
- np.float (float) - defaults to 64 bits.
- np.complex (complex) - rarely seen in data analysis.
- np.object (O, object) - typically strings, but a catch-all for columns with multiple different types or other Python objects (tuples, lists, dicts, and so on).
- np.datetime64, pd.Timestamp (datetime64) - a specific moment in time with nanosecond precision.
- np.timedelta64, pd.Timedelta (timedelta64) - an amount of time, from days to nanoseconds.
- pd.Categorical (Categorical) - specific only to pandas. Useful for object columns with relatively few unique values. A short sketch of declaring and converting dtypes follows this list.
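A minimal sketch (toy values, my own example, not from the original notes) of declaring and converting some of these dtypes:

# toy data, just to show dtype declarations and conversions
s = pd.Series(['a', 'b', 'a', 'c'])
ic(s.dtype)                                          # object
ic(s.astype('category').dtype)                       # category
ic(pd.Series([1, 2, 3], dtype=np.uint8).dtype)       # uint8
ic(pd.to_datetime(pd.Series(['2001-02-03'])).dtype)  # datetime64[ns]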
ic(economics.dtypes.value_counts())
print("-"*60)
ic(economics.dtypes)
> ic> economics.dtypes.value_counts(): float64 3
> int64 2
> dtype: int64
> ------------------------------------------------------------
> ic> economics.dtypes: pce float64
> pop int64
> psavert float64
> uempmed float64
> unemploy int64
> dtype: object
result >>> pce float64
result >>> pop int64
result >>> psavert float64
result >>> uempmed float64
result >>> unemploy int64
result >>> dtype: object
#pd.Series
numpy array with labels
rnd_series = pd.Series(rnd.randint(0, 10, 6))
ic(type(rnd_series))
ic(rnd_series.shape)
ic(rnd_series.index)
ic(rnd_series.value_counts(normalize=True))
ic(rnd_series.hasnans)
> ic> type(rnd_series): <class 'pandas.core.series.Series'>
> ic> rnd_series.shape: (6,)
> ic> rnd_series.index: RangeIndex(start=0, stop=6, step=1)
> ic> rnd_series.value_counts(normalize=True): 6 0.333333
> 3 0.166667
> 7 0.166667
> 4 0.166667
> 9 0.166667
> dtype: float64
> ic> rnd_series.hasnans: False
result >>> False
#pd.DataFrame
economics = pd.read_csv(url_economics, index_col='date',parse_dates=True)
economics.head(2)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 1967-07-01 | 507.4 | 198712 | 12.5 | 4.5 | 2944 |
| 1967-08-01 | 510.5 | 198911 | 12.5 | 4.7 | 2945 |
# raw values, shape, columns and index
ic(economics.values[0])
ic(type(economics.values))
ic(economics.shape)
ic(economics.columns[0:4])
ic(economics.index[0])
> ic> economics.values[0]: array([5.07400e+02, 1.98712e+05, 1.25000e+01, 4.50000e+00, 2.94400e+03])
> ic> type(economics.values): <class 'numpy.ndarray'>
> ic> economics.shape: (574, 5)
> ic> economics.columns[0:4]: Index(['pce', 'pop', 'psavert', 'uempmed'], dtype='object')
> ic> economics.index[0]: Timestamp('1967-07-01 00:00:00')
result >>> Timestamp('1967-07-01 00:00:00')
economics.info()
> <class 'pandas.core.frame.DataFrame'>
> DatetimeIndex: 574 entries, 1967-07-01 to 2015-04-01
> Data columns (total 5 columns):
> # Column Non-Null Count Dtype
> --- ------ -------------- -----
> 0 pce 574 non-null float64
> 1 pop 574 non-null int64
> 2 psavert 574 non-null float64
> 3 uempmed 574 non-null float64
> 4 unemploy 574 non-null int64
> dtypes: float64(3), int64(2)
> memory usage: 26.9 KB
#Data: Generate Data (numpy)
pandas uses numpy to generate indexes and to perform operations on matrices.
#Random data in Dataframe/Series
# generating random data based on numpy generated array
pd.DataFrame(rnd.randint(0, 10, 6).reshape(-1, 6)).head()
| | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| 0 | 2 | 6 | 7 | 4 | 3 | 7 |
#Data: Import and Export
#Import
We can import data from csv, excel, hdf5, and other formats…
# Read a comma-separated values (csv) file into DataFrame.
economics = pd.read_csv(url_economics, index_col='date', parse_dates=True)
economics.head(3)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 1967-07-01 | 507.4 | 198712 | 12.5 | 4.5 | 2944 |
| 1967-08-01 | 510.5 | 198911 | 12.5 | 4.7 | 2945 |
| 1967-09-01 | 516.3 | 199113 | 11.7 | 4.6 | 2958 |
#Export
# excel export depends on openpyxl
#> pip install openpyxl 2>&1 1>/dev/null
# Write DataFrame to a comma-separated values (csv) file.
economics.to_csv(path_or_buf="data.csv", index=False)
# and actually saving the dataframe to excel
economics.to_excel("data.xlsx")
#> unlink data.xlsx
#> unlink data.csv
#Export to data structures
dfdict = economics.to_dict()
ic(len(dfdict))
ic(dfdict.keys())
> ic> len(dfdict): 5
> ic> dfdict.keys(): dict_keys(['pce', 'pop', 'psavert', 'uempmed', 'unemploy'])
result >>> dict_keys(['pce', 'pop', 'psavert', 'uempmed', 'unemploy'])
#Data: Indexing
#pd.Index
# manually created index
indexes=pd.Index([0,1,2,3,4,5,6])
ic(type(indexes))
ic(indexes.size)
ic(indexes.shape)
ic(indexes.dtype)
> ic> type(indexes): <class 'pandas.core.indexes.numeric.Int64Index'>
> ic> indexes.size: 7
> ic> indexes.shape: (7,)
> ic> indexes.dtype: dtype('int64')
result >>> dtype('int64')
Create an index and sort by it on an existing dataframe
economics.set_index('unemploy').sort_index().head(5)
| unemploy | pce | pop | psavert | uempmed |
|----------|-----|-----|---------|---------|
| 2685 | 577.2 | 201621 | 10.9 | 4.4 |
| 2686 | 568.8 | 201095 | 10.4 | 4.6 |
| 2689 | 572.3 | 201290 | 10.6 | 4.8 |
| 2692 | 589.5 | 201881 | 9.4 | 4.9 |
| 2709 | 544.6 | 200208 | 12.2 | 4.6 |
and reset index
economics.set_index('unemploy').reset_index().head(2)
| | unemploy | pce | pop | psavert | uempmed |
|---|----------|-----|-----|---------|---------|
| 0 | 2944 | 507.4 | 198712 | 12.5 | 4.5 |
| 1 | 2945 | 510.5 | 198911 | 12.5 | 4.7 |
#pd.DatetimeIndex
https://docs.scipy.org/doc/numpy/reference/arrays.datetime.html
# we can use numpy-generated arrays for indexes or data
np.arange('1993-01-01', '1993-01-20', dtype="datetime64[W]")
result >>> array(['1992-12-31', '1993-01-07'], dtype='datetime64[W]')
# date generation
pd.date_range("2001-01-01", periods=3, freq="w")
result >>> DatetimeIndex(['2001-01-07', '2001-01-14', '2001-01-21'], dtype='datetime64[ns]', freq='W-SUN')
# autoguessing date format
pd.to_datetime(['1/2/2018', 'Jan 04, 2018'])
result >>> DatetimeIndex(['2018-01-02', '2018-01-04'], dtype='datetime64[ns]', freq=None)
# providing a date format
pd.to_datetime(['2/1/2018', '6/1/2018'], format="%d/%m/%Y")
result >>> DatetimeIndex(['2018-01-02', '2018-01-06'], dtype='datetime64[ns]', freq=None)
# business days
pd.date_range("2018-01-02", periods=3, freq='B')
result >>> DatetimeIndex(['2018-01-02', '2018-01-03', '2018-01-04'], dtype='datetime64[ns]', freq='B')
Parsing dates
economics = pd.read_csv(url_economics, parse_dates=True)
economics.head(3)
| | date | pce | pop | psavert | uempmed | unemploy |
|---|------|-----|-----|---------|---------|----------|
| 0 | 1967-07-01 | 507.4 | 198712 | 12.5 | 4.5 | 2944 |
| 1 | 1967-08-01 | 510.5 | 198911 | 12.5 | 4.7 | 2945 |
| 2 | 1967-09-01 | 516.3 | 199113 | 11.7 | 4.6 | 2958 |
pd.to_datetime(economics['date'], format='%Y-%m-%d').dt.strftime("%Y")[1:4]
result >>> 1 1967
result >>> 2 1967
result >>> 3 1967
result >>> Name: date, dtype: object
post-load datetime transformation
economics = pd.read_csv(url_economics)
economics['date'] = pd.to_datetime(economics['date'])
economics.set_index('date',inplace=True)
economics.head(3)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 1967-07-01 | 507.4 | 198712 | 12.5 | 4.5 | 2944 |
| 1967-08-01 | 510.5 | 198911 | 12.5 | 4.7 | 2945 |
| 1967-09-01 | 516.3 | 199113 | 11.7 | 4.6 | 2958 |
#Labels
tmp = pd.Series(['Oleg', 'Developer'], index=['person', 'who'])
# loc allows requesting data by label
# iloc allows requesting data by integer position
ic(tmp.loc['person'])
ic(tmp.iloc[1])
> ic> tmp.loc['person']: 'Oleg'
> ic> tmp.iloc[1]: 'Developer'
result >>> 'Developer'
tmp = pd.Series(range(26), index=[x for x in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'])
ic(tmp[3:9].values)
ic(tmp["D":"I"].values)
ic(tmp.iloc[3:9].values)
> ic> tmp[3:9].values: array([3, 4, 5, 6, 7, 8])
> ic> tmp["D":"I"].values: array([3, 4, 5, 6, 7, 8])
> ic> tmp.iloc[3:9].values: array([3, 4, 5, 6, 7, 8])
result >>> array([3, 4, 5, 6, 7, 8])
re-labeling / re-indexing
# reindexing
tmp.index = [x for x in 'GATTACAHIJKLMNOPQRSTUVWXYZ']
ic(tmp.loc['G'])
# requesting a slice with non-unique labels
try:
    tmp.loc['G':'A']
except KeyError as e:
    printError(e)
stderr> Error: "Cannot get right slice bound for non-unique label: 'A'"
#Index Based Access
economics = pd.read_csv(url_economics, index_col=["date"], parse_dates=True)
economics.head(3)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 1967-07-01 | 507.4 | 198712 | 12.5 | 4.5 | 2944 |
| 1967-08-01 | 510.5 | 198911 | 12.5 | 4.7 | 2945 |
| 1967-09-01 | 516.3 | 199113 | 11.7 | 4.6 | 2958 |
# access a column by name
economics["unemploy"].head(2)
result >>> date
result >>> 1967-07-01 2944
result >>> 1967-08-01 2945
result >>> Name: unemploy, dtype: int64
# or access the column as an attribute
economics.unemploy.head(2)
result >>> date
result >>> 1967-07-01 2944
result >>> 1967-08-01 2945
result >>> Name: unemploy, dtype: int64
economics.iloc[[0], :]
economics.iloc[[0,2,5,4], [0,1,2]]
| date | pce | pop | psavert |
|------|-----|-----|---------|
| 1967-07-01 | 507.4 | 198712 | 12.5 |
| 1967-09-01 | 516.3 | 199113 | 11.7 |
| 1967-12-01 | 525.8 | 199657 | 12.1 |
| 1967-11-01 | 518.1 | 199498 | 12.5 |
economics[['pce', 'pop']].head(2)
| date | pce | pop |
|------|-----|-----|
| 1967-07-01 | 507.4 | 198712 |
| 1967-08-01 | 510.5 | 198911 |
loc based indexing
# ranges and selections
economics.loc['1967-07-01':'1967-09-01', ['pop', 'pce']].head(2)
| date | pop | pce |
|------|-----|-----|
| 1967-07-01 | 198712 | 507.4 |
| 1967-08-01 | 198911 | 510.5 |
# ranges and ALL
economics.loc['1967-07-01':'1967-09-01', :].head(2)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 1967-07-01 | 507.4 | 198712 | 12.5 | 4.5 | 2944 |
| 1967-08-01 | 510.5 | 198911 | 12.5 | 4.7 | 2945 |
# ranges and ranges
economics.loc['1967-07-01':'1967-09-01', 'pce':'psavert'].head(2)
| date | pce | pop | psavert |
|------|-----|-----|---------|
| 1967-07-01 | 507.4 | 198712 | 12.5 |
| 1967-08-01 | 510.5 | 198911 | 12.5 |
iloc based indexing
# ranges and ALL
economics.iloc[0:5,:].head(2)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 1967-07-01 | 507.4 | 198712 | 12.5 | 4.5 | 2944 |
| 1967-08-01 | 510.5 | 198911 | 12.5 | 4.7 | 2945 |
# ranges and selection
economics.iloc[0:5,[0,3,4]].head(2)
| date | pce | uempmed | unemploy |
|------|-----|---------|----------|
| 1967-07-01 | 507.4 | 4.5 | 2944 |
| 1967-08-01 | 510.5 | 4.7 | 2945 |
# ranges and ranges
economics.iloc[0:5, 2:4].head(2)
| date | psavert | uempmed |
|------|---------|---------|
| 1967-07-01 | 12.5 | 4.5 |
| 1967-08-01 | 12.5 | 4.7 |
# combining different approaches
pd.concat([
    economics.iloc[0:1, 2:4],
    economics.iloc[6:7, 2:4],
    economics.loc['1978-07-01':'1978-09-01', ['psavert','uempmed']],
]).head(3)
| date | psavert | uempmed |
|------|---------|---------|
| 1967-07-01 | 12.5 | 4.5 |
| 1968-01-01 | 11.7 | 5.1 |
| 1978-07-01 | 10.3 | 5.8 |
# combining preselected columns with loc.
economics[['pce', 'unemploy']].loc['1967-07-01':'1969-07-01'].head(2)
| date | pce | unemploy |
|------|-----|----------|
| 1967-07-01 | 507.4 | 2944 |
| 1967-08-01 | 510.5 | 2945 |
# combining preselected columns with iloc.
economics[['pce', 'unemploy']].iloc[10:14].head(2)
| date | pce | unemploy |
|------|-----|----------|
| 1968-05-01 | 550.4 | 2740 |
| 1968-06-01 | 556.8 | 2938 |
#Conditional Indexes
economics.loc[((economics.pce <= 630) & (economics.pce >= 600)), ['psavert', 'unemploy']].head(2)
| date | psavert | unemploy |
|------|---------|----------|
| 1969-05-01 | 10.0 | 2713 |
| 1969-06-01 | 10.9 | 2816 |
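The same condition can be written more compactly with DataFrame.query; a small sketch (my own rewrite, equivalent to the loc expression above):

# boolean selection via query()
economics.query('600 <= pce <= 630')[['psavert', 'unemploy']].head(2)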
#Labeled Indexes (2)
size = 10
tmp = pd.DataFrame(rnd.randint(0, 10, size**2).reshape(size, -1),
                   index=[f"R{x:02d}" for x in range(size)],
                   columns=[f"C{x:02d}" for x in range(size)])
tmp
| | C00 | C01 | C02 | C03 | C04 | C05 | C06 | C07 | C08 | C09 |
|---|---|---|---|---|---|---|---|---|---|---|
| R00 | 7 | 2 | 5 | 4 | 1 | 7 | 5 | 1 | 4 | 0 |
| R01 | 9 | 5 | 8 | 0 | 9 | 2 | 6 | 3 | 8 | 2 |
| R02 | 4 | 2 | 6 | 4 | 8 | 6 | 1 | 3 | 8 | 1 |
| R03 | 9 | 8 | 9 | 4 | 1 | 3 | 6 | 7 | 2 | 0 |
| R04 | 3 | 1 | 7 | 3 | 1 | 5 | 5 | 9 | 3 | 5 |
| R05 | 1 | 9 | 1 | 9 | 3 | 7 | 6 | 8 | 7 | 4 |
| R06 | 1 | 4 | 7 | 9 | 8 | 8 | 0 | 8 | 6 | 8 |
| R07 | 7 | 0 | 7 | 7 | 2 | 0 | 7 | 2 | 2 | 0 |
| R08 | 4 | 9 | 6 | 9 | 8 | 6 | 8 | 7 | 1 | 0 |
| R09 | 6 | 6 | 7 | 4 | 2 | 7 | 5 | 2 | 0 | 2 |
# slicing with labels selects rows, and no rows are labeled 'C05'..'C09'
tmp['C05':'C09']
| | C00 | C01 | C02 | C03 | C04 | C05 | C06 | C07 | C08 | C09 |
|---|---|---|---|---|---|---|---|---|---|---|
# but as a column lookup it's ok!
tmp['C05']
result >>> R00 7
result >>> R01 2
result >>> R02 6
result >>> R03 3
result >>> R04 5
result >>> R05 7
result >>> R06 8
result >>> R07 0
result >>> R08 6
result >>> R09 7
result >>> Name: C05, dtype: int64
# and slicing by row labels works too
tmp['R05':'R06']

| | C00 | C01 | C02 | C03 | C04 | C05 | C06 | C07 | C08 | C09 |
|---|---|---|---|---|---|---|---|---|---|---|
| R05 | 1 | 9 | 1 | 9 | 3 | 7 | 6 | 8 | 7 | 4 |
| R06 | 1 | 4 | 7 | 9 | 8 | 8 | 0 | 8 | 6 | 8 |
#MultiIndex
pdlen = 20
tmp = pd.DataFrame(
    {
        'city'     : [x for x in ['Paris', 'London', 'Berlin', 'Manchester', 'Kyiv']*10][:pdlen],
        'category' : rnd.randint(0, 7, pdlen),
        'price'    : rnd.randint(10, 300, pdlen),
        'rating'   : rnd.randint(0, 5, pdlen),
    }
)
tmp['country'] = tmp['city'].map({
    'Paris':'FR',
    'London':'UK',
    'Berlin':'DE',
    'Manchester':'US',
    'Kyiv':'UA',
})
tmp.head(5)
| | city | category | price | rating | country |
|---|------|----------|-------|--------|---------|
| 0 | Paris | 4 | 72 | 4 | FR |
| 1 | London | 6 | 240 | 0 | UK |
| 2 | Berlin | 5 | 250 | 4 | DE |
| 3 | Manchester | 2 | 61 | 3 | US |
| 4 | Kyiv | 0 | 105 | 3 | UA |
tmp = tmp.groupby(['country', 'city', 'category']).mean()
tmp
| country | city | category | price | rating |
|---------|------|----------|-------|--------|
| DE | Berlin | 0 | 22.0 | 0.0 |
| | | 3 | 293.0 | 3.0 |
| | | 5 | 250.0 | 4.0 |
| | | 6 | 246.0 | 3.0 |
| FR | Paris | 1 | 252.0 | 2.0 |
| | | 4 | 151.5 | 3.5 |
| | | 6 | 38.0 | 3.0 |
| UA | Kyiv | 0 | 105.0 | 3.0 |
| | | 2 | 179.0 | 0.0 |
| | | 5 | 180.0 | 1.0 |
| | | 6 | 196.0 | 0.0 |
| UK | London | 1 | 167.5 | 1.5 |
| | | 2 | 45.0 | 0.0 |
| | | 6 | 240.0 | 0.0 |
| US | Manchester | 2 | 61.0 | 3.0 |
| | | 4 | 75.0 | 4.0 |
| | | 6 | 160.5 | 1.0 |
# show all indexes and levels
ic(tmp.index)
ic(tmp.index.levels)
print("-"*70)
ic(tmp.index.names)
ic(tmp.index.values)
print("-"*70)
ic(tmp.index.get_level_values(2))
print("-"*70)
ic(tmp.loc["UA"])
print("-"*70)
ic(tmp.loc["UA", "Kyiv"].max())
ic(tmp.loc["UA", "Kyiv", 0:4])
> ic> tmp.index: MultiIndex([('DE', 'Berlin', 0), ('DE', 'Berlin', 3), ('DE', 'Berlin', 5), ('DE', 'Berlin', 6), ('FR', 'Paris', 1), ('FR', 'Paris', 4), ('FR', 'Paris', 6), ('UA', 'Kyiv', 0), ('UA', 'Kyiv', 2), ('UA', 'Kyiv', 5), ('UA', 'Kyiv', 6), ('UK', 'London', 1), ('UK', 'London', 2), ('UK', 'London', 6), ('US', 'Manchester', 2), ('US', 'Manchester', 4), ('US', 'Manchester', 6)], names=['country', 'city', 'category'])
> ic> tmp.index.levels: FrozenList([['DE', 'FR', 'UA', 'UK', 'US'], ['Berlin', 'Kyiv', 'London', 'Manchester', 'Paris'], [0, 1, 2, 3, 4, 5, 6]])
> ----------------------------------------------------------------------
> ic> tmp.index.names: FrozenList(['country', 'city', 'category'])
> ic> tmp.index.values: array([('DE', 'Berlin', 0), ('DE', 'Berlin', 3), ('DE', 'Berlin', 5), ('DE', 'Berlin', 6), ('FR', 'Paris', 1), ('FR', 'Paris', 4), ('FR', 'Paris', 6), ('UA', 'Kyiv', 0), ('UA', 'Kyiv', 2), ('UA', 'Kyiv', 5), ('UA', 'Kyiv', 6), ('UK', 'London', 1), ('UK', 'London', 2), ('UK', 'London', 6), ('US', 'Manchester', 2), ('US', 'Manchester', 4), ('US', 'Manchester', 6)], dtype=object)
> ----------------------------------------------------------------------
> ic> tmp.index.get_level_values(2): Int64Index([0, 3, 5, 6, 1, 4, 6, 0, 2, 5, 6, 1, 2, 6, 2, 4, 6], dtype='int64', name='category')
> ----------------------------------------------------------------------
> ic> tmp.loc["UA"]: price rating
> city category
> Kyiv 0 105.0 3.0
> 2 179.0 0.0
> 5 180.0 1.0
> 6 196.0 0.0
> ----------------------------------------------------------------------
> ic> tmp.loc["UA", "Kyiv"].max(): price 196.0
> rating 3.0
> dtype: float64
> ic> tmp.loc["UA", "Kyiv", 0:4]: price rating
> country city category
> UA Kyiv 0 105.0 3.0
> 2 179.0 0.0
| country | city | category | price | rating |
|---------|------|----------|-------|--------|
| UA | Kyiv | 0 | 105.0 | 3.0 |
| | | 2 | 179.0 | 0.0 |
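Cross-sections offer another way to slice a MultiIndex; a minimal sketch using .xs() on the grouped frame above (my own addition, not from the original notes):

# select every row whose 'city' level equals 'Kyiv', whatever the country
tmp.xs('Kyiv', level='city')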
# without inplace=True this would return a new dataframe
tmp.rename(index={'UA':'ЮА'}, columns={'price':'Precio'}, inplace=True)
tmp
| country | city | category | Precio | rating |
|---------|------|----------|--------|--------|
| DE | Berlin | 0 | 22.0 | 0.0 |
| | | 3 | 293.0 | 3.0 |
| | | 5 | 250.0 | 4.0 |
| | | 6 | 246.0 | 3.0 |
| FR | Paris | 1 | 252.0 | 2.0 |
| | | 4 | 151.5 | 3.5 |
| | | 6 | 38.0 | 3.0 |
| ЮА | Kyiv | 0 | 105.0 | 3.0 |
| | | 2 | 179.0 | 0.0 |
| | | 5 | 180.0 | 1.0 |
| | | 6 | 196.0 | 0.0 |
| UK | London | 1 | 167.5 | 1.5 |
| | | 2 | 45.0 | 0.0 |
| | | 6 | 240.0 | 0.0 |
| US | Manchester | 2 | 61.0 | 3.0 |
| | | 4 | 75.0 | 4.0 |
| | | 6 | 160.5 | 1.0 |
#pd.Index as Sets
# intersection
ic(pd.Index([1,2,3,4]) & pd.Index([3,4,5,6]))
# union
ic(pd.Index([1,2,3,4]) | pd.Index([3,4,5,6]))
# symmetric difference
ic(pd.Index([1,2,3,4]) ^ pd.Index([3,4,5,6]))
stderr> /var/folders/7j/8x_gv0vs7f33q1cxc5cyhl3r0000gn/T/ipykernel_29394/1864070959.py:2: FutureWarning: Index.__and__ operating as a set operation is deprecated, in the future this will be a logical operation matching Series.__and__. Use index.intersection(other) instead.
stderr> ic(pd.Index([1,2,3,4]) & pd.Index([3,4,5,6]))
stderr> /var/folders/7j/8x_gv0vs7f33q1cxc5cyhl3r0000gn/T/ipykernel_29394/1864070959.py:5: FutureWarning: Index.__or__ operating as a set operation is deprecated, in the future this will be a logical operation matching Series.__or__. Use index.union(other) instead.
stderr> ic(pd.Index([1,2,3,4]) | pd.Index([3,4,5,6]))
stderr> /var/folders/7j/8x_gv0vs7f33q1cxc5cyhl3r0000gn/T/ipykernel_29394/1864070959.py:8: FutureWarning: Index.__xor__ operating as a set operation is deprecated, in the future this will be a logical operation matching Series.__xor__. Use index.symmetric_difference(other) instead.
stderr> ic(pd.Index([1,2,3,4]) ^ pd.Index([3,4,5,6]))
result >>> Int64Index([1, 2, 5, 6], dtype='int64')
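As the warnings suggest, the operator forms are deprecated; a sketch of the method-based equivalents:

# method-based set operations on indexes
ic(pd.Index([1,2,3,4]).intersection(pd.Index([3,4,5,6])))
ic(pd.Index([1,2,3,4]).union(pd.Index([3,4,5,6])))
ic(pd.Index([1,2,3,4]).symmetric_difference(pd.Index([3,4,5,6])))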
#Index Alignment
# source data area and population
data_area = { 'Alaska': 1723337, 'Texas': 695662, 'California': 423967}
data_ppls = { 'California': 38332521, 'Texas': 26448193, 'New York': 19651127}
area = pd.Series(data_area, name='area')
ppls = pd.Series(data_ppls, name='ppls')
# index alignment
ic(ppls/area)
> ic> ppls/area: Alaska NaN
> California 90.413926
> New York NaN
> Texas 38.018740
> dtype: float64
result >>> Alaska NaN
result >>> California 90.413926
result >>> New York NaN
result >>> Texas 38.018740
result >>> dtype: float64
One more example
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
ic(A+B)
> ic> A+B: 0 NaN
> 1 5.0
> 2 9.0
> 3 NaN
> dtype: float64
result >>> 0 NaN
result >>> 1 5.0
result >>> 2 9.0
result >>> 3 NaN
result >>> dtype: float64
using method equivalents (.add() instead of +) we can fill NaN values
# using the method form we can fill NaN values
ic(A.add(B, fill_value=100))
> ic> A.add(B, fill_value=100): 0 102.0
> 1 5.0
> 2 9.0
> 3 105.0
> dtype: float64
result >>> 0 102.0
result >>> 1 5.0
result >>> 2 9.0
result >>> 3 105.0
result >>> dtype: float64
economics = pd.read_csv(url_economics, index_col='date',parse_dates=True)
ic(economics.unemploy.idxmin())
ic(economics.unemploy.min())
ic(np.argmin(economics.unemploy))
print("-"*60)
ic(economics.unemploy.idxmax())
ic(economics.unemploy.max())
ic(np.argmax(economics.unemploy))
> ic> economics.unemploy.idxmin(): Timestamp('1968-12-01 00:00:00')
> ic> economics.unemploy.min(): 2685
> ic> np.argmin(economics.unemploy): 17
> ------------------------------------------------------------
> ic> economics.unemploy.idxmax(): Timestamp('2009-10-01 00:00:00')
> ic> economics.unemploy.max(): 15352
> ic> np.argmax(economics.unemploy): 507
result >>> 507
#Data: Handling Data
#Handling Missing Data
titanic = pd.read_csv(url_titanic, keep_default_na=False)
titanic['age'][4:7].head()
result >>> 4 35.0
result >>> 5
result >>> 6 54.0
result >>> Name: age, dtype: object
# Additional strings to recognize as NA/NaN.
# If dict passed, specific per-column NA values.
# By default the following values are interpreted as
# NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’,
# ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘n/a’, ‘nan’, ‘null’.
titanic = pd.read_csv(url_titanic, na_values=[35.0])
titanic['age'][4:7].head()
result >>> 4 NaN
result >>> 5 NaN
result >>> 6 54.0
result >>> Name: age, dtype: float64
titanic = pd.read_csv(url_titanic, keep_default_na=True)
titanic['age'][4:7].head()
result >>> 4 35.0
result >>> 5 NaN
result >>> 6 54.0
result >>> Name: age, dtype: float64
ic(titanic['age'].hasnans)
ic(titanic['age'].isnull().sum())
ic(titanic['age'].dropna().hasnans)
ic(titanic['age'].dropna().isnull().sum())
ic(titanic['age'].notnull().sum())
ic(np.count_nonzero(titanic['age'].isnull()))
> ic> titanic['age'].hasnans: True
> ic> titanic['age'].isnull().sum(): 177
> ic> titanic['age'].dropna().hasnans: False
> ic> titanic['age'].dropna().isnull().sum(): 0
> ic> titanic['age'].notnull().sum(): 714
> ic> np.count_nonzero(titanic['age'].isnull()): 177
result >>> 177
titanic.age.value_counts(dropna=True).plot.pie()
getting rid of NaNs
ages = pd.DataFrame()
ages['fillna(0)'] = titanic['age'].fillna(0)
ages['fillna - ffill'] = titanic['age'].fillna(method='ffill')
ages['fillna - bfill'] = titanic['age'].fillna(method='bfill')
ages['dropna'] = titanic['age'].dropna(inplace=False )
ages['interpolate'] = titanic['age'].interpolate()
ages['min'] = titanic['age'].fillna(titanic['age'].min())
ages['mean'] = titanic['age'].fillna(titanic['age'].mean())
ages[4:7]
| | fillna(0) | fillna - ffill | fillna - bfill | dropna | interpolate | min | mean |
|---|-----------|----------------|----------------|--------|-------------|-----|------|
| 4 | 35.0 | 35.0 | 35.0 | 35.0 | 35.0 | 35.00 | 35.000000 |
| 5 | 0.0 | 35.0 | 54.0 | NaN | 44.5 | 0.42 | 29.699118 |
| 6 | 54.0 | 54.0 | 54.0 | 54.0 | 54.0 | 54.00 | 54.000000 |
#Melting/Merging/Concatenating Data
titanic = pd.read_csv(url_titanic, keep_default_na=True)
titanic.head(3)
| | survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
Renaming columns and indexes
titanic.rename(columns={'sex':'gender'}, inplace=True)
titanic.head(3)
| | survived | pclass | gender | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
dropping columns
titanic.drop('embarked', axis=1, errors="ignore", inplace=True)
titanic.drop('who', axis=1, errors="ignore", inplace=True)
titanic.drop(['adult_male', 'alone'], axis=1, errors="ignore", inplace=True)
titanic.drop(['parch', 'deck'], axis=1, errors="ignore", inplace=True)
titanic.head(3)
| | survived | pclass | gender | age | sibsp | fare | class | embark_town | alive |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 7.2500 | Third | Southampton | no |
| 1 | 1 | 1 | female | 38.0 | 1 | 71.2833 | First | Cherbourg | yes |
| 2 | 1 | 3 | female | 26.0 | 0 | 7.9250 | Third | Southampton | yes |
# mean age per class, embark town and gender
titanic = titanic.groupby(['class', 'embark_town', 'gender'])['age'].mean().reset_index()
titanic
| | class | embark_town | gender | age |
|---|-------|-------------|--------|-----|
| 0 | First | Cherbourg | female | 36.052632 |
| 1 | First | Cherbourg | male | 40.111111 |
| 2 | First | Queenstown | female | 33.000000 |
| 3 | First | Queenstown | male | 44.000000 |
| 4 | First | Southampton | female | 32.704545 |
| 5 | First | Southampton | male | 41.897188 |
| 6 | Second | Cherbourg | female | 19.142857 |
| 7 | Second | Cherbourg | male | 25.937500 |
| 8 | Second | Queenstown | female | 30.000000 |
| 9 | Second | Queenstown | male | 57.000000 |
| 10 | Second | Southampton | female | 29.719697 |
| 11 | Second | Southampton | male | 30.875889 |
| 12 | Third | Cherbourg | female | 14.062500 |
| 13 | Third | Cherbourg | male | 25.016800 |
| 14 | Third | Queenstown | female | 22.850000 |
| 15 | Third | Queenstown | male | 28.142857 |
| 16 | Third | Southampton | female | 23.223684 |
| 17 | Third | Southampton | male | 26.574766 |
pivoting data
titanic.pivot_table(index=['class','gender'],columns='embark_town',values='age').reset_index().head(3)
| embark_town | class | gender | Cherbourg | Queenstown | Southampton |
|-------------|-------|--------|-----------|------------|-------------|
| 0 | First | female | 36.052632 | 33.0 | 32.704545 |
| 1 | First | male | 40.111111 | 44.0 | 41.897188 |
| 2 | Second | female | 19.142857 | 30.0 | 29.719697 |
melting data
pd.melt(titanic, id_vars=['class', 'gender']).head(3)
| | class | gender | variable | value |
|---|-------|--------|----------|-------|
| 0 | First | female | embark_town | Cherbourg |
| 1 | First | male | embark_town | Cherbourg |
| 2 | First | female | embark_town | Queenstown |
# todo - cover unstack() / stack() - a quick sketch follows
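A minimal sketch of the idea (my own example, not from the original notes), using the grouped titanic frame above:

# stack/unstack move data between row index levels and column levels
wide = titanic.set_index(['class', 'embark_town', 'gender'])
ic(wide.unstack('gender').head(2))                  # gender level -> columns
ic(wide.unstack('gender').stack('gender').head(2))  # and back to rows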
#Common Math operations over the DataFrames/Series
economics = pd.read_csv(url_economics, index_col='date',parse_dates=True)
economics.info()
> <class 'pandas.core.frame.DataFrame'>
> DatetimeIndex: 574 entries, 1967-07-01 to 2015-04-01
> Data columns (total 5 columns):
> # Column Non-Null Count Dtype
> --- ------ -------------- -----
> 0 pce 574 non-null float64
> 1 pop 574 non-null int64
> 2 psavert 574 non-null float64
> 3 uempmed 574 non-null float64
> 4 unemploy 574 non-null int64
> dtypes: float64(3), int64(2)
> memory usage: 26.9 KB
economics.describe()

| | pce | pop | psavert | uempmed | unemploy |
|---|-----|-----|---------|---------|----------|
| count | 574.000000 | 574.000000 | 574.000000 | 574.000000 | 574.000000 |
| mean | 4843.510453 | 257189.381533 | 7.936585 | 8.610105 | 7771.557491 |
| std | 3579.287206 | 36730.801593 | 3.124394 | 4.108112 | 2641.960571 |
| min | 507.400000 | 198712.000000 | 1.900000 | 4.000000 | 2685.000000 |
| 25% | 1582.225000 | 224896.000000 | 5.500000 | 6.000000 | 6284.000000 |
| 50% | 3953.550000 | 253060.000000 | 7.700000 | 7.500000 | 7494.000000 |
| 75% | 7667.325000 | 290290.750000 | 10.500000 | 9.100000 | 8691.000000 |
| max | 12161.500000 | 320887.000000 | 17.000000 | 25.200000 | 15352.000000 |
economics.pce.mean()
result >>> 4843.510452961673
economics.pce = economics.pce * 2
economics.head()
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 1967-07-01 | 1014.8 | 198712 | 12.5 | 4.5 | 2944 |
| 1967-08-01 | 1021.0 | 198911 | 12.5 | 4.7 | 2945 |
| 1967-09-01 | 1032.6 | 199113 | 11.7 | 4.6 | 2958 |
| 1967-10-01 | 1025.8 | 199311 | 12.5 | 4.9 | 3143 |
| 1967-11-01 | 1036.2 | 199498 | 12.5 | 4.7 | 3066 |
cs = economics.pce.cumsum()
ic(cs.head(1))
ic(cs.tail(1))
> ic> cs.head(1): date
> 1967-07-01 1014.8
> Name: pce, dtype: float64
> ic> cs.tail(1): date
> 2015-04-01 5560350.0
> Name: pce, dtype: float64
result >>> date
result >>> 2015-04-01 5560350.0
result >>> Name: pce, dtype: float64
#Math operations over the DataFrames/Series
Using apply over column data
titanic = pd.read_csv(url_titanic, keep_default_na=True)
titanic.head(3)
| | survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
# hypothetical flat conversion, purely for illustration: £1 -> $5
def pounds_to_dollars(fare):
    return fare * 5

titanic.rename(columns={'fare':'1912_£'}, inplace=True)
titanic['1912_$'] = titanic['1912_£'].apply(pounds_to_dollars)
titanic.loc[:, ['class', '1912_£', '1912_$']].head(5)
| | class | 1912_£ | 1912_$ |
|---|-------|--------|--------|
| 0 | Third | 7.2500 | 36.2500 |
| 1 | First | 71.2833 | 356.4165 |
| 2 | Third | 7.9250 | 39.6250 |
| 3 | First | 53.1000 | 265.5000 |
| 4 | Third | 8.0500 | 40.2500 |
#Grouped Calculations
df = pd.read_csv(url_economics, parse_dates=['date'])
ic(df['date'].dtypes)
> ic> df['date'].dtypes: dtype('<M8[ns]')
result >>> dtype('<M8[ns]')
df.groupby('date')[['pce', 'unemploy']].mean().head(10)
| date | pce | unemploy |
|------|-----|----------|
| 1967-07-01 | 507.4 | 2944.0 |
| 1967-08-01 | 510.5 | 2945.0 |
| 1967-09-01 | 516.3 | 2958.0 |
| 1967-10-01 | 512.9 | 3143.0 |
| 1967-11-01 | 518.1 | 3066.0 |
| 1967-12-01 | 525.8 | 3018.0 |
| 1968-01-01 | 531.5 | 2878.0 |
| 1968-02-01 | 534.2 | 3001.0 |
| 1968-03-01 | 544.9 | 2877.0 |
| 1968-04-01 | 544.6 | 2709.0 |
df.groupby(by=df['date'].dt.strftime('%Y'))[['unemploy', 'uempmed']].mean().reset_index().head(3)
| | date | unemploy | uempmed |
|---|------|----------|---------|
| 0 | 1967 | 3012.333333 | 4.700000 |
| 1 | 1968 | 2797.416667 | 4.500000 |
| 2 | 1969 | 2830.166667 | 4.441667 |
df.groupby(by=df['date'].dt.strftime('%Y'))[['unemploy', 'uempmed']].first().reset_index().head(3)
| | date | unemploy | uempmed |
|---|------|----------|---------|
| 0 | 1967 | 2944 | 4.5 |
| 1 | 1968 | 2878 | 5.1 |
| 2 | 1969 | 2718 | 4.4 |
df.groupby(by=df['date'].dt.strftime('%Y'))[['unemploy', 'uempmed']].last().reset_index().head(3)
| | date | unemploy | uempmed |
|---|------|----------|---------|
| 0 | 1967 | 3018 | 4.8 |
| 1 | 1968 | 2685 | 4.4 |
| 2 | 1969 | 2884 | 4.6 |
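Beyond mean()/first()/last(), several aggregations can be combined in a single pass with .agg(); a sketch:

# multiple aggregations per column in one groupby
df.groupby(by=df['date'].dt.strftime('%Y'))[['unemploy', 'uempmed']].agg(['mean', 'min', 'max']).head(3)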
#Joins
size = 6
df_square = pd.DataFrame(rnd.randint(0, 10, size**2).reshape(size, -1),
                         index=[f"R{x:02d}" for x in range(size)],
                         columns=[f"C{x:02d}" for x in range(size)])
df_square
| | C00 | C01 | C02 | C03 | C04 | C05 |
|---|---|---|---|---|---|---|
| R00 | 2 | 6 | 5 | 7 | 8 | 4 |
| R01 | 0 | 2 | 9 | 7 | 5 | 7 |
| R02 | 8 | 3 | 0 | 0 | 9 | 3 |
| R03 | 6 | 1 | 2 | 0 | 4 | 0 |
| R04 | 7 | 0 | 0 | 1 | 1 | 5 |
| R05 | 6 | 4 | 0 | 0 | 2 | 1 |
df_line = pd.Series(rnd.randint(0, 10, size), index=[f"R{x+3:02d}" for x in range(size)])
df_line.name="C06"
df_line
result >>> R03 4
result >>> R04 9
result >>> R05 5
result >>> R06 6
result >>> R07 3
result >>> R08 6
result >>> Name: C06, dtype: int64
df_square['C06'] = df_line
df_square
| | C00 | C01 | C02 | C03 | C04 | C05 | C06 |
|---|---|---|---|---|---|---|---|
| R00 | 2 | 6 | 5 | 7 | 8 | 4 | NaN |
| R01 | 0 | 2 | 9 | 7 | 5 | 7 | NaN |
| R02 | 8 | 3 | 0 | 0 | 9 | 3 | NaN |
| R03 | 6 | 1 | 2 | 0 | 4 | 0 | 4.0 |
| R04 | 7 | 0 | 0 | 1 | 1 | 5 | 9.0 |
| R05 | 6 | 4 | 0 | 0 | 2 | 1 | 5.0 |
clean up data
df_square.drop('C06', axis=1, errors="ignore", inplace=True)
df_square.join(pd.DataFrame(df_line), how="inner")
| | C00 | C01 | C02 | C03 | C04 | C05 | C06 |
|---|---|---|---|---|---|---|---|
| R03 | 6 | 1 | 2 | 0 | 4 | 0 | 4 |
| R04 | 7 | 0 | 0 | 1 | 1 | 5 | 9 |
| R05 | 6 | 4 | 0 | 0 | 2 | 1 | 5 |
df_square.join(pd.DataFrame(df_line), how="outer")
| | C00 | C01 | C02 | C03 | C04 | C05 | C06 |
|---|---|---|---|---|---|---|---|
| R00 | 2.0 | 6.0 | 5.0 | 7.0 | 8.0 | 4.0 | NaN |
| R01 | 0.0 | 2.0 | 9.0 | 7.0 | 5.0 | 7.0 | NaN |
| R02 | 8.0 | 3.0 | 0.0 | 0.0 | 9.0 | 3.0 | NaN |
| R03 | 6.0 | 1.0 | 2.0 | 0.0 | 4.0 | 0.0 | 4.0 |
| R04 | 7.0 | 0.0 | 0.0 | 1.0 | 1.0 | 5.0 | 9.0 |
| R05 | 6.0 | 4.0 | 0.0 | 0.0 | 2.0 | 1.0 | 5.0 |
| R06 | NaN | NaN | NaN | NaN | NaN | NaN | 6.0 |
| R07 | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 |
| R08 | NaN | NaN | NaN | NaN | NaN | NaN | 6.0 |
df_square.join(pd.DataFrame(df_line), how="left")
| | C00 | C01 | C02 | C03 | C04 | C05 | C06 |
|---|---|---|---|---|---|---|---|
| R00 | 2 | 6 | 5 | 7 | 8 | 4 | NaN |
| R01 | 0 | 2 | 9 | 7 | 5 | 7 | NaN |
| R02 | 8 | 3 | 0 | 0 | 9 | 3 | NaN |
| R03 | 6 | 1 | 2 | 0 | 4 | 0 | 4.0 |
| R04 | 7 | 0 | 0 | 1 | 1 | 5 | 9.0 |
| R05 | 6 | 4 | 0 | 0 | 2 | 1 | 5.0 |
df_square.join(pd.DataFrame(df_line), how="right")
| | C00 | C01 | C02 | C03 | C04 | C05 | C06 |
|---|---|---|---|---|---|---|---|
| R03 | 6.0 | 1.0 | 2.0 | 0.0 | 4.0 | 0.0 | 4 |
| R04 | 7.0 | 0.0 | 0.0 | 1.0 | 1.0 | 5.0 | 9 |
| R05 | 6.0 | 4.0 | 0.0 | 0.0 | 2.0 | 1.0 | 5 |
| R06 | NaN | NaN | NaN | NaN | NaN | NaN | 6 |
| R07 | NaN | NaN | NaN | NaN | NaN | NaN | 3 |
| R08 | NaN | NaN | NaN | NaN | NaN | NaN | 6 |
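The same alignments can be expressed with pd.merge by joining on the index explicitly; a sketch of the outer case:

# pd.merge equivalent of df_square.join(..., how="outer")
pd.merge(df_square, pd.DataFrame(df_line), left_index=True, right_index=True, how='outer')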
#Sorting values and indexes
economics = pd.read_csv(url_economics, index_col='date',parse_dates=True)
economics.sort_values(by='unemploy', ascending=False).head(3)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 2009-10-01 | 9924.6 | 308189 | 5.4 | 18.9 | 15352 |
| 2010-04-01 | 10106.9 | 309191 | 5.6 | 22.1 | 15325 |
| 2009-11-01 | 9946.1 | 308418 | 5.7 | 19.8 | 15219 |
economics.sort_index(ascending=False).head(3)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 2015-04-01 | 12158.9 | 320887 | 5.6 | 11.7 | 8549 |
| 2015-03-01 | 12161.5 | 320707 | 5.2 | 12.2 | 8575 |
| 2015-02-01 | 12095.9 | 320534 | 5.7 | 13.1 | 8705 |
#Time Series Analysis
economics = pd.read_csv(url_economics, index_col='date',parse_dates=True)
economics.head(2)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 1967-07-01 | 507.4 | 198712 | 12.5 | 4.5 | 2944 |
| 1967-08-01 | 510.5 | 198911 | 12.5 | 4.7 | 2945 |
economics.index
result >>> DatetimeIndex(['1967-07-01', '1967-08-01', '1967-09-01', '1967-10-01',
result >>> '1967-11-01', '1967-12-01', '1968-01-01', '1968-02-01',
result >>> '1968-03-01', '1968-04-01',
result >>> ...
result >>> '2014-07-01', '2014-08-01', '2014-09-01', '2014-10-01',
result >>> '2014-11-01', '2014-12-01', '2015-01-01', '2015-02-01',
result >>> '2015-03-01', '2015-04-01'],
result >>> dtype='datetime64[ns]', name='date', length=574, freq=None)
TIME SERIES OFFSET ALIASES
| ALIAS | DESCRIPTION |
|-------|-------------|
| B | business day frequency |
| C | custom business day frequency (experimental) |
| D | calendar day frequency |
| W | weekly frequency |
| M | month end frequency |
| SM | semi-month end frequency (15th and end of month) |
| BM | business month end frequency |
| CBM | custom business month end frequency |
| MS | month start frequency |
| SMS | semi-month start frequency (1st and 15th) |
| BMS | business month start frequency |
| CBMS | custom business month start frequency |
| Q | quarter end frequency |
| BQ | business quarter end frequency |
| QS | quarter start frequency |
| BQS | business quarter start frequency |
| A | year end frequency |
| BA | business year end frequency |
| AS | year start frequency |
| BAS | business year start frequency |
| BH | business hour frequency |
| H | hourly frequency |
| T, min | minutely frequency |
| S | secondly frequency |
| L, ms | milliseconds |
| U, us | microseconds |
| N | nanoseconds |
#Time Resampling
# yearly
economics.resample(rule='A').mean().head(3)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 1967-12-31 | 515.166667 | 199200.333333 | 12.300000 | 4.700000 | 3012.333333 |
| 1968-12-31 | 557.458333 | 200663.750000 | 11.216667 | 4.500000 | 2797.416667 |
| 1969-12-31 | 604.483333 | 202648.666667 | 10.741667 | 4.441667 | 2830.166667 |
economics.resample(rule='A').first().head(3)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 1967-12-31 | 507.4 | 198712 | 12.5 | 4.5 | 2944 |
| 1968-12-31 | 531.5 | 199808 | 11.7 | 5.1 | 2878 |
| 1969-12-31 | 584.2 | 201760 | 10.0 | 4.4 | 2718 |
# custom resampling function
def total(entry):
    if len(entry):
        return entry.sum()

economics.unemploy.resample(rule='A').apply(total).head(2)
result >>> date
result >>> 1967-12-31 18074
result >>> 1968-12-31 33569
result >>> Freq: A-DEC, Name: unemploy, dtype: int64
#Time/Data Shifting
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.shift.html
economics.unemploy.shift(1).head(2)
result >>> date
result >>> 1967-07-01 NaN
result >>> 1967-08-01 2944.0
result >>> Name: unemploy, dtype: float64
# shift the index forward two month-ends (periods=2, freq='M')
economics.unemploy.shift(periods=2, freq='M').head()
result >>> date
result >>> 1967-08-31 2944
result >>> 1967-09-30 2945
result >>> 1967-10-31 2958
result >>> 1967-11-30 3143
result >>> 1967-12-31 3066
result >>> Name: unemploy, dtype: int64
#Rolling and Expanding
A common process with time series is to create data based on a rolling mean. The idea is to divide the data into “windows” of time, and then calculate an aggregate function for each window. In this way we obtain a simple moving average.
economics.unemploy.plot(figsize=(12,6)).autoscale(axis='x',tight=True)
economics.unemploy.rolling(window=14).mean().plot()
# economics.unemploy.rolling(window=30).max().plot()
# economics.unemploy.rolling(window=30).min().plot()
Instead of calculating values for a rolling window of dates, what if you wanted to take into account everything from the start of the time series up to each point in time? For example, instead of considering the average over the last 7 days, we would consider all prior data in our expanding set of averages.
economics.unemploy.expanding(min_periods=30).mean().plot(figsize=(12,6))
#Visualization
import matplotlib.pyplot as plt
%matplotlib inline
economics = pd.read_csv(url_economics, index_col='date',parse_dates=True)
economics.head(2)
| date | pce | pop | psavert | uempmed | unemploy |
|------|-----|-----|---------|---------|----------|
| 1967-07-01 | 507.4 | 198712 | 12.5 | 4.5 | 2944 |
| 1967-08-01 | 510.5 | 198911 | 12.5 | 4.7 | 2945 |
# plotting all data for a range of x values (date index)
_ = economics.unemploy.plot(
    figsize=(12,4),
    xlim=['1970', '1980'],
)
# or we can specify what column we want to plot
ax = economics['unemploy'].plot(
    figsize=(14,4),
    xlim=['1970', '1980'],
    ylim=[2200, 9000],
    title="Unemployment for period 1970-1980"
)
_ = ax.set(ylabel="Unemployment", xlabel="Years")
# this would make the data fit tightly, regardless of the xlim we set
# ax.autoscale(axis='x',tight=True)
# x range via slicing
economics['unemploy']['1970':'1979'].plot(figsize=(14,4), ls='--',c='r').autoscale(axis='x',tight=True)
# or by providing xlim/ylim arguments
_ = economics['unemploy'].plot(figsize=(14,4), xlim=['1970','1980'], ylim=[2200, 9000])
#X-Ticks
https://matplotlib.org/api/dates_api.html
Formatting follows the Python datetime strftime codes.
The following examples are based on datetime.datetime(2001, 2, 3, 16, 5, 6):
| CODE | MEANING | EXAMPLE |
|------|---------|---------|
| %Y | Year with century as a decimal number | 2001 |
| %y | Year without century as a zero-padded decimal number | 01 |
| %m | Month as a zero-padded decimal number | 02 |
| %B | Month as locale’s full name | February |
| %b | Month as locale’s abbreviated name | Feb |
| %d | Day of the month as a zero-padded decimal number | 03 |
| %A | Weekday as locale’s full name | Saturday |
| %a | Weekday as locale’s abbreviated name | Sat |
| %H | Hour (24-hour clock) as a zero-padded decimal number | 16 |
| %I | Hour (12-hour clock) as a zero-padded decimal number | 04 |
| %p | Locale’s equivalent of either AM or PM | PM |
| %M | Minute as a zero-padded decimal number | 05 |
| %S | Second as a zero-padded decimal number | 06 |
| CODE | MEANING | EXAMPLE |
|------|---------|---------|
| %#m | Month as a decimal number. (Windows) | 2 |
| %-m | Month as a decimal number. (Mac/Linux) | 2 |
| %#x | Long date | Saturday, February 03, 2001 |
| %#c | Long date and time | Saturday, February 03, 2001 16:05:06 |
from datetime import datetime
datetime(2001, 2, 3, 16, 5, 6).strftime("%A, %B %d, %Y %I:%M:%S %p")
result >>> 'Saturday, February 03, 2001 04:05:06 PM'
from matplotlib import dates
# TODO - find a better example so I can use a month or weekday formatter.
# using Date Locators (a sketch follows)
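A sketch of the intended idea (my own example, not from the original notes): a year locator plus a matching formatter on the x-axis. x_compat=True makes pandas hand the axis over to matplotlib's date machinery so the locators apply.

ax = economics['unemploy']['1970':'1975'].plot(figsize=(14,4), x_compat=True)
ax.xaxis.set_major_locator(dates.YearLocator())
ax.xaxis.set_major_formatter(dates.DateFormatter('%Y'))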
#Miscellaneous
#Loops
# series
s = pd.Series(range(10000))
# list
l = list(s)
Pandas Series
%timeit -n 1 -r 1 [s.iloc[i] for i in range(len(s))]
> 21.4 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
Python’s list
%timeit -n 1 -r 1 [l[i] for i in range(len(l))]
> 189 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
Pandas - accessing the last element
%timeit -n 1 -r 1 s.iloc[-1:]
> 53.8 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
List - accessing the last element
%timeit -n 1 -r 1 l[-1:]
> 375 ns ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
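The takeaway: element-wise Python loops over a Series are costly, while vectorized operations stay in compiled code. A minimal sketch of the preferred style:

# prefer vectorized operations over explicit Python loops
doubled = s * 2    # element-wise multiply, no Python-level loop
total = s.sum()    # aggregation runs in C, not in the interpreter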