numpy

image.png

# main lib
import numpy as np

# for additional examples
import pandas as pd
import matplotlib.pyplot as plt

#Creating Numpy Arrays

  • array - Convert input data (list, tuple, array, or other sequence type) to an ndarray either by inferring a dtype or explicitly specifying a dtype. Copies the input data by default.
  • asarray - Convert input to ndarray, but do not copy if the input is already an ndarray
  • arange - Like the built-in range but returns an ndarray instead of a list.
  • ones, ones_like - Produce an array of all 1’s with the given shape and dtype. ones_like takes another array and produces a ones array of the same shape and dtype.
  • zeros, zeros_like - Like ones and ones_like but producing arrays of 0’s instead
  • empty, empty_like - Create new arrays by allocating new memory, but do not populate with any values like ones and zeros
  • full, full_like - Create new array with predefined value.
  • eye, identity - Create a square N x N identity matrix (1’s on the diagonal and 0’s elsewhere)
  • triu and tril - Upper/Lower triangle of an array.
# random array
np.random.random(10)

result >>> array([0.14381165, 0.43957196, 0.64480165, 0.29396112, 0.03620634,
result >>>        0.324091  , 0.01413331, 0.61707653, 0.55425182, 0.76508725])
np.random.random((10, 10))

result >>> array([[0.84814059, 0.49366146, 0.07357296, 0.86473168, 0.90994484,
result >>>         0.59937738, 0.81070949, 0.95118411, 0.26313043, 0.40428545],
result >>>        [0.06764683, 0.31008572, 0.48085707, 0.3123668 , 0.99413403,
result >>>         0.61140935, 0.27311947, 0.46530946, 0.35310464, 0.49019337],
result >>>        [0.19613178, 0.65399376, 0.77925913, 0.3325733 , 0.0194955 ,
result >>>         0.58361935, 0.18698444, 0.56680306, 0.40493511, 0.29486893],
result >>>        [0.17297747, 0.22939305, 0.76750417, 0.67280111, 0.40667287,
result >>>         0.2769071 , 0.49143694, 0.61032024, 0.27884701, 0.28583778],
result >>>        [0.34889296, 0.24250182, 0.50627885, 0.13695109, 0.58947309,
result >>>         0.37040892, 0.05495742, 0.19997462, 0.40929128, 0.09534295],
result >>>        [0.91114241, 0.43814007, 0.54084854, 0.48690268, 0.79957151,
result >>>         0.60292787, 0.04642761, 0.19954316, 0.55384688, 0.87143598],
result >>>        [0.19789552, 0.92630145, 0.7875622 , 0.52719853, 0.35621211,
result >>>         0.15969174, 0.13203917, 0.86605477, 0.22833789, 0.88161008],
result >>>        [0.6079528 , 0.02602084, 0.28829024, 0.25180093, 0.93785455,
result >>>         0.35431702, 0.19532462, 0.63491947, 0.94570596, 0.07221944],
result >>>        [0.35733435, 0.31192497, 0.62450735, 0.65710509, 0.06752447,
result >>>         0.4298031 , 0.57751011, 0.62875046, 0.22889312, 0.55232044],
result >>>        [0.43244427, 0.57533423, 0.51541704, 0.77377705, 0.45161832,
result >>>         0.47569954, 0.78782687, 0.60993249, 0.57680231, 0.97852437]])
# transformation
(np.random.random((10, 10))*10).astype(int)

result >>> array([[7, 1, 3, 3, 0, 6, 9, 4, 0, 9],
result >>>        [9, 3, 0, 9, 7, 8, 1, 7, 5, 3],
result >>>        [9, 9, 0, 1, 9, 2, 5, 8, 5, 0],
result >>>        [9, 3, 4, 0, 5, 9, 1, 6, 8, 9],
result >>>        [3, 9, 2, 9, 6, 7, 3, 1, 3, 1],
result >>>        [3, 8, 9, 7, 5, 4, 2, 1, 1, 4],
result >>>        [5, 9, 9, 0, 9, 1, 5, 6, 6, 3],
result >>>        [4, 8, 7, 0, 1, 5, 8, 3, 0, 7],
result >>>        [5, 8, 3, 2, 2, 9, 2, 2, 6, 4],
result >>>        [0, 1, 2, 9, 5, 4, 5, 3, 2, 6]])
# 10 elements from 0 to 5
np.linspace(0, 5, 10)

result >>> array([0.        , 0.55555556, 1.11111111, 1.66666667, 2.22222222,
result >>>        2.77777778, 3.33333333, 3.88888889, 4.44444444, 5.        ])
# Regular python list
np.array([1,2,3,4])

result >>> array([1, 2, 3, 4])

NumPy offers several functions to create arrays with initial placeholder content Create an array of zeros with desired shape (x,y)

  • x == number of rows
  • y == number of columns in array
np.zeros((3,4))

result >>> array([[0., 0., 0., 0.],
result >>>        [0., 0., 0., 0.],
result >>>        [0., 0., 0., 0.]])
np.ones((3,4))

result >>> array([[1., 1., 1., 1.],
result >>>        [1., 1., 1., 1.],
result >>>        [1., 1., 1., 1.]])
# Using dtype in order to specify the data type
np.ones((3,4),dtype=np.int16)

result >>> array([[1, 1, 1, 1],
result >>>        [1, 1, 1, 1],
result >>>        [1, 1, 1, 1]], dtype=int16)
# np.empty() is not the same as np.zeros()
np.empty((2,3))

result >>> array([[0., 0., 0.],
result >>>        [0., 0., 0.]])
# we also can specify type
np.full((2,3), 4, dtype=np.double)

result >>> array([[4., 4., 4.],
result >>>        [4., 4., 4.]])
# np.eye() creates an eyedentity matrix
np.eye(3)

result >>> array([[1., 0., 0.],
result >>>        [0., 1., 0.],
result >>>        [0., 0., 1.]])
# `*_like` will use input array as example of shape

np.full_like(np.zeros((2,4)), np.nan)

result >>> array([[nan, nan, nan, nan],
result >>>        [nan, nan, nan, nan]])

To create sequences of numbers, NumPy provides a function analogous to range that returns arrays instead of lists

arange(start, stop, step, dtype)

np.arange(2, 20, 2, dtype=float)

result >>> array([ 2.,  4.,  6.,  8., 10., 12., 14., 16., 18.])
np.arange(2, 20, 2)

result >>> array([ 2,  4,  6,  8, 10, 12, 14, 16, 18])
array_2d = np.array([(2,4,6),(3,5,7)]) 
array_2d

result >>> array([[2, 4, 6],
result >>>        [3, 5, 7]])
array_2d.shape

result >>> (2, 3)
# Using reshape to create n dimensional arrays
np.arange(6).reshape(3,2)

result >>> array([[0, 1],
result >>>        [2, 3],
result >>>        [4, 5]])

The reshape will only take arguments that multiply to the number in arange function. For example: for arange(8), the possible combinations for reshape are (2,4), (4,2), (2,2,2)

np.arange(6).reshape(6,1)

result >>> array([[0],
result >>>        [1],
result >>>        [2],
result >>>        [3],
result >>>        [4],
result >>>        [5]])
np.zeros_like(np.arange(6).reshape(6,1))

result >>> array([[0],
result >>>        [0],
result >>>        [0],
result >>>        [0],
result >>>        [0],
result >>>        [0]])
np.ones_like(np.arange(6).reshape(6,1))

result >>> array([[1],
result >>>        [1],
result >>>        [1],
result >>>        [1],
result >>>        [1],
result >>>        [1]])
np.triu((np.random.random((5, 5))*10).astype(int))

result >>> array([[9, 9, 0, 7, 0],
result >>>        [0, 5, 6, 1, 3],
result >>>        [0, 0, 8, 9, 0],
result >>>        [0, 0, 0, 8, 4],
result >>>        [0, 0, 0, 0, 0]])
np.triu((np.random.random((5, 5))*10).astype(int), 1)

result >>> array([[0, 0, 5, 6, 1],
result >>>        [0, 0, 1, 9, 0],
result >>>        [0, 0, 0, 2, 4],
result >>>        [0, 0, 0, 0, 4],
result >>>        [0, 0, 0, 0, 0]])
np.tril((np.random.random((5, 5))*10).astype(int))

result >>> array([[6, 0, 0, 0, 0],
result >>>        [4, 0, 0, 0, 0],
result >>>        [5, 6, 3, 0, 0],
result >>>        [8, 2, 0, 7, 0],
result >>>        [2, 0, 1, 5, 7]])
np.tril((np.random.random((5, 5))*10).astype(int), 1)

result >>> array([[4, 5, 0, 0, 0],
result >>>        [5, 6, 4, 0, 0],
result >>>        [7, 3, 5, 7, 0],
result >>>        [7, 9, 6, 0, 0],
result >>>        [0, 6, 4, 0, 0]])

#Printitng arrays

print(np.arange(10000).reshape(100,100))

> [[   0    1    2 ...   97   98   99]
> [ 100  101  102 ...  197  198  199]
> [ 200  201  202 ...  297  298  299]
> ...
> [9700 9701 9702 ... 9797 9798 9799]
> [9800 9801 9802 ... 9897 9898 9899]
> [9900 9901 9902 ... 9997 9998 9999]]
np.set_printoptions(threshold = 1000)
print(np.arange(10000).reshape(100,100))

> [[   0    1    2 ...   97   98   99]
> [ 100  101  102 ...  197  198  199]
> [ 200  201  202 ...  297  298  299]
> ...
> [9700 9701 9702 ... 9797 9798 9799]
> [9800 9801 9802 ... 9897 9898 9899]
> [9900 9901 9902 ... 9997 9998 9999]]

#Arithmetic Operations

If the dimensions of two arrays are dissimilar, element-to-element operations are not possible. However, operations on arrays of non-similar shapes is still possible in NumPy, because of the broadcasting capability.

#Elementwise operations

a = np.array([10,10,10])
b = np.array([5,5,5])
a + b

result >>> array([15, 15, 15])
a - b

result >>> array([5, 5, 5])
a * b

result >>> array([50, 50, 50])
a / b

result >>> array([2., 2., 2.])
a % 3

result >>> array([1, 1, 1])
a < 35

result >>> array([ True,  True,  True])
a > 25

result >>> array([False, False, False])
a ** 2

result >>> array([100, 100, 100])

#dot function or method

A = np.array( [[1,1], [0,1]] )
B = np.array( [[2,1], [3,4]] )

print('A:\n', A)
print('B:\n', B)

> A:
> [[1 1]
> [0 1]]
> B:
> [[2 1]
> [3 4]]
# This gives the matrix multiplication
A.dot(B)

result >>> array([[5, 5],
result >>>        [3, 4]])
np.dot(A,B)

result >>> array([[5, 5],
result >>>        [3, 4]])

#in-place operations

# Modifying an existing array rather than create a new one
a  *= 3
a

result >>> array([30, 30, 30])
b += a
b

result >>> array([35, 35, 35])
### Unary Operators
ages = np.array([12,15,18,20])
ages.sum()

result >>> 65
ages.min()

result >>> 12
ages.max()

result >>> 20

By default, these operations apply to the array as though it were a list of numbers, regardless of its shape. However, by specifying the axis parameter you can apply an operation along the specified axis of an array

numbers = np.arange(12).reshape(3,4)
numbers

result >>> array([[ 0,  1,  2,  3],
result >>>        [ 4,  5,  6,  7],
result >>>        [ 8,  9, 10, 11]])

Row and column operations In a 2D array axis #0 represents columns. Axis #1 refers to rows

# Sum up each column
numbers.sum(axis=0)

result >>> array([12, 15, 18, 21])
# Sum up each row
numbers.sum(axis=1)

result >>> array([ 6, 22, 38])
# Minimum of each row
numbers.min(axis=1)

result >>> array([0, 4, 8])

#Input and Output

#File System

a = np.array(range(10))
b = a * 3
# save array in txt uncompressed
np.savetxt("np_a.txt", a)
print(np.loadtxt('np_a.txt'))

> [0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]
# save in binary format
np.save("np_a", a)
print(np.load('np_a.npy'))

> [0 1 2 3 4 5 6 7 8 9]
# save few arrays
np.savez("np_ab", a=a, b=b)
# loadign data from uncompressed file
data = np.load('np_ab.npz')
print(data['a'])

> [0 1 2 3 4 5 6 7 8 9]
# save few arrays compressed
np.savez_compressed("np_ab_z", a=a, b=b)
data = np.load('np_ab_z.npz')
print(data['a'])

> [0 1 2 3 4 5 6 7 8 9]
!ls np*

> np_a.npy  np_a.txt  np_ab.npz  np_ab_z.npz
#> unlink np_a.npy
#> unlink np_a.txt
#> unlink np_ab.npz
#> unlink np_ab_z.npz

#Memory

https://docs.scipy.org/doc/numpy/reference/generated/numpy.memmap.html#numpy.memmap

from tempfile import mkdtemp
import os.path as path
filename = path.join(mkdtemp(), 'newfile.dat')
fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))
fp

result >>> memmap([[0., 0., 0., 0.],
result >>>         [0., 0., 0., 0.],
result >>>         [0., 0., 0., 0.]], dtype=float32)
data = np.arange(12, dtype='float32')
data.resize((3,4))
fp[:] = data[:]
fp

result >>> memmap([[ 0.,  1.,  2.,  3.],
result >>>         [ 4.,  5.,  6.,  7.],
result >>>         [ 8.,  9., 10., 11.]], dtype=float32)
fp.filename == path.abspath(filename)
fp.filename

result >>> '/var/folders/7j/8x_gv0vs7f33q1cxc5cyhl3r0000gn/T/tmpllvrg39n/newfile.dat'
del fp

#Universal Functions

#Trigonometric Functions

angles = np.array([0,30,45,60,90])

Angles need to be converted to radians by multiplying by pi/180 Only then can we appy trigonometric functions to our array.

angles_radians = angles * np.pi/180
angles_radians

result >>> array([0.        , 0.52359878, 0.78539816, 1.04719755, 1.57079633])
# Sine of angles in the array:
np.sin(angles_radians)

result >>> array([0.        , 0.5       , 0.70710678, 0.8660254 , 1.        ])
## Alternatively, use the np.radians() function to convert to radians
angles_radians = np.radians(angles)
angles_radians

result >>> array([0.        , 0.52359878, 0.78539816, 1.04719755, 1.57079633])
# Cosine of angles in the array:
np.cos(angles_radians)

result >>> array([1.00000000e+00, 8.66025404e-01, 7.07106781e-01, 5.00000000e-01,
result >>>        6.12323400e-17])
# Tangent of angles in the array
np.tan(angles_radians)

result >>> array([0.00000000e+00, 5.77350269e-01, 1.00000000e+00, 1.73205081e+00,
result >>>        1.63312394e+16])

arcsin, arcos, and arctan functions return the trigonometric inverse of sin, cos, and tan of the given angle. The result of these functions can be verified by numpy.degrees() function by converting radians to degrees.

# Compute sine inverse of angles. Returned values are in radians.

np.arcsin(np.sin(angles * np.pi/180) )

result >>> array([0.        , 0.52359878, 0.78539816, 1.04719755, 1.57079633])
# np.degrees() converts radians to degrees
np.degrees(np.arcsin(np.sin(angles * np.pi/180) ) )

result >>> array([ 0., 30., 45., 60., 90.])

#Statistical Functions

test_scores = np.array([32.32, 56.98, 21.52, 44.32, 
                        55.63, 13.75, 43.47, 43.34])
# Mean test scores of the students: 
np.mean(test_scores)

result >>> 38.91625
# Median
np.median(test_scores)

result >>> 43.405
# We will now perform basic statistical methods on real 
# life dataset. We will use salary data of 1147 European developers.

url = "https://gist.githubusercontent.com/butuzov/ed1c7f8f3affe6dd005c1ee40dc3f7f2/raw/9012bdf4a623aa10e5abe3932028f79609337c19/sallary.csv"
salaries = np.genfromtxt(url, delimiter=',')
salaries

result >>> array([60000., 58000., 56967., ..., 54647., 25000., 70000.])
salaries.shape

result >>> (1147,)
print(f"Mean = {np.mean(salaries)}")
print(f"Median = {np.median(salaries)}")
print(f"Standard Deviation = {np.std(salaries)}")
print(f"Variance = {np.var(salaries)}")

> Mean = 55894.53879686138
> Median = 48000.0
> Standard Deviation = 55170.37550939316
> Variance = 3043770333.8474483

#Indexing and Slicing

This also follows zero based indexing like python lists

a = np.arange(11)**2
a

result >>> array([  0,   1,   4,   9,  16,  25,  36,  49,  64,  81, 100])
a[2]

result >>> 4
a[-2]

result >>> 81
a[2:7]

result >>> array([ 4,  9, 16, 25, 36])
a[2:-2]

result >>> array([ 4,  9, 16, 25, 36, 49, 64])
a[:7]

result >>> array([ 0,  1,  4,  9, 16, 25, 36])
a[:11:2]

result >>> array([  0,   4,  16,  36,  64, 100])
a[::-1]

result >>> array([100,  81,  64,  49,  36,  25,  16,   9,   4,   1,   0])

#2D Array indexing

Consider an array students, it contains the test scores in two courses of the students against their names

students = np.array([['Alice','Beth','Cathy','Dorothy'],
                     [65,78,90,81],
                     [71,82,79,92]])
students

result >>> array([['Alice', 'Beth', 'Cathy', 'Dorothy'],
result >>>        ['65', '78', '90', '81'],
result >>>        ['71', '82', '79', '92']], dtype='<U21')
students[0]

result >>> array(['Alice', 'Beth', 'Cathy', 'Dorothy'], dtype='<U21')
students[1]

result >>> array(['65', '78', '90', '81'], dtype='<U21')
students[2]

result >>> array(['71', '82', '79', '92'], dtype='<U21')
students[0,1]

result >>> 'Beth'
# get data using method
students.item((0,1))

result >>> 'Beth'
# set data using method
students.itemset((0,1), 'Michel')
students

result >>> array([['Alice', 'Michel', 'Cathy', 'Dorothy'],
result >>>        ['65', '78', '90', '81'],
result >>>        ['71', '82', '79', '92']], dtype='<U21')

#Searching in arrays

a = np.arange(16).reshape(4,4)**2
a

result >>> array([[  0,   1,   4,   9],
result >>>        [ 16,  25,  36,  49],
result >>>        [ 64,  81, 100, 121],
result >>>        [144, 169, 196, 225]])
np.where(a > 50)

result >>> (array([2, 2, 2, 2, 3, 3, 3, 3]), array([0, 1, 2, 3, 0, 1, 2, 3]))
np.argwhere(a > 50)

result >>> array([[2, 0],
result >>>        [2, 1],
result >>>        [2, 2],
result >>>        [2, 3],
result >>>        [3, 0],
result >>>        [3, 1],
result >>>        [3, 2],
result >>>        [3, 3]])

#2D Array slicing

This will consider the rows 0 and 1, columns 2 and 3

students[0:2,2:4]

result >>> array([['Cathy', 'Dorothy'],
result >>>        ['90', '81']], dtype='<U21')
# all rows and columns 1
students[:,1:2]

result >>> array([['Michel'],
result >>>        ['78'],
result >>>        ['82']], dtype='<U21')
# all rows and 1,2 columns
students[:,1:3]

result >>> array([['Michel', 'Cathy'],
result >>>        ['78', '90'],
result >>>        ['82', '79']], dtype='<U21')
# All columns, rows 0 and 1
students[0:2,:]

result >>> array([['Alice', 'Michel', 'Cathy', 'Dorothy'],
result >>>        ['65', '78', '90', '81']], dtype='<U21')
# All rows and columns
students[:]

result >>> array([['Alice', 'Michel', 'Cathy', 'Dorothy'],
result >>>        ['65', '78', '90', '81'],
result >>>        ['71', '82', '79', '92']], dtype='<U21')
#The last row
students[-1,:]

result >>> array(['71', '82', '79', '92'], dtype='<U21')
#3rd from last to second from last row, last two columns
students[-3:-1,-2:]

result >>> array([['Cathy', 'Dorothy'],
result >>>        ['90', '81']], dtype='<U21')

#dots or ellipsis(…)

Slicing can also include ellipsis (…) to make a selection tuple of the same length as the dimension of an array. The dots (…) represent as many colons as needed to produce a complete indexing tuple

students[0,...]

result >>> array(['Alice', 'Michel', 'Cathy', 'Dorothy'], dtype='<U21')
# all rows and column 1
students[...,1]

result >>> array(['Michel', '78', '82'], dtype='<U21')
students[...,1].shape

result >>> (3,)

#Iteratition

#1D Arrays

a = np.arange(11)**2
a

result >>> array([  0,   1,   4,   9,  16,  25,  36,  49,  64,  81, 100])
for i in a:
    print(i**(1/2))

> 0.0
> 1.0
> 2.0
> 3.0
> 4.0
> 5.0
> 6.0
> 7.0
> 8.0
> 9.0
> 10.0

#Multi-Dimensional Arrays

students = np.array([['Alice','Beth','Cathy','Dorothy'],
                     [65,78,90,81],
                     [71,82,79,92]])
# Each iteration will be over the rows of the array
for i in students:
    print('i = ', i)

> i =  ['Alice' 'Beth' 'Cathy' 'Dorothy']
> i =  ['65' '78' '90' '81']
> i =  ['71' '82' '79' '92']

Flatten a multi-dimensional array

# If one wants to perform an operation on each element in the array, 
# one can use the flatten function which will flatten the array to a 
# single dimension. 'By default', the flattening will occur `row-wise` 
# (also knows as C order)

for element in students.flatten():
    print(element)

> Alice
> Beth
> Cathy
> Dorothy
> 65
> 78
> 90
> 81
> 71
> 82
> 79
> 92
# To flatten a 2D array column-wise, 
# use the Fortran order
for element in students.flatten(order='F'):
    print(element)

> Alice
> 65
> 71
> Beth
> 78
> 82
> Cathy
> 90
> 79
> Dorothy
> 81
> 92
# nditer
# Efficient multi-dimensional iterator object to iterate over arrays
x = np.arange(12).reshape(3,4)
x

result >>> array([[ 0,  1,  2,  3],
result >>>        [ 4,  5,  6,  7],
result >>>        [ 8,  9, 10, 11]])
# This is row-wise iteration, similar to iterating over a C-order flattened array
for i in np.nditer(x):
    print(i)

> 0
> 1
> 2
> 3
> 4
> 5
> 6
> 7
> 8
> 9
> 10
> 11
# Fortran order
# This is like iterating over an array which has been flattened column-wise

for i in np.nditer(x, order = 'F'): 
    print(i)

> 0
> 4
> 8
> 1
> 5
> 9
> 2
> 6
> 10
> 3
> 7
> 11
# There are a number of flags which we can pass as a list to nditer. 
# Many of these involve setting buffering options <br />
# If we want iterate over each column, we can use the flag argument with value 'external_loop'

for i in np.nditer(x, order = 'F', flags = ['external_loop']): 
    print(i)

> [0 4 8]
> [1 5 9]
> [ 2  6 10]
> [ 3  7 11]

#Modifying Array Values

By default, the nditer treats the input array as a read-only object. To modify the array elements, you must specify either read-write or write-only mode. This is controlled with per-operand flags.

# will trhoud value error
try:
    for arr in np.nditer(x):
        arr[...] = arr * arr
except ValueError as e:
    print("Error:", e)
finally:
    print(arr)

> Error: assignment destination is read-only
> 0
# We set the ops_flag to make the array read-write
try:
    for arr in np.nditer(x, op_flags = ['readwrite']):
        arr[...] = arr * arr
except ValueError as e:
    print("Error:", e)
finally:
    print(arr)

> 121

#Array Shape Manipulation

a = np.array([("Germany","France", "Hungary","Austria"),
              ("Berlin","Paris", "Budapest","Vienna" )])
a

result >>> array([['Germany', 'France', 'Hungary', 'Austria'],
result >>>        ['Berlin', 'Paris', 'Budapest', 'Vienna']], dtype='<U8')
a.shape

result >>> (2, 4)

The ravel() function

The primary functional difference is that flatten is a method of an ndarray object and hence can only be called for true numpy arrays. In contrast ravel() is a library-level function and hence can be called on any object that can successfully be parsed. For example ravel() will work on a list of ndarrays, while flatten will not.

a.ravel()

result >>> array(['Germany', 'France', 'Hungary', 'Austria', 'Berlin', 'Paris',
result >>>        'Budapest', 'Vienna'], dtype='<U8')
# transposing array
a.T

result >>> array([['Germany', 'Berlin'],
result >>>        ['France', 'Paris'],
result >>>        ['Hungary', 'Budapest'],
result >>>        ['Austria', 'Vienna']], dtype='<U8')
a.T.ravel()

result >>> array(['Germany', 'Berlin', 'France', 'Paris', 'Hungary', 'Budapest',
result >>>        'Austria', 'Vienna'], dtype='<U8')

reshape() gives a new shape to an array without changing its data.

a.shape

result >>> (2, 4)
a.reshape(4,2)

result >>> array([['Germany', 'France'],
result >>>        ['Hungary', 'Austria'],
result >>>        ['Berlin', 'Paris'],
result >>>        ['Budapest', 'Vienna']], dtype='<U8')
np.arange(15).reshape(3,5)

result >>> array([[ 0,  1,  2,  3,  4],
result >>>        [ 5,  6,  7,  8,  9],
result >>>        [10, 11, 12, 13, 14]])
# Reshaping a 15-element array to an 18-element one will throw an error
try:
    np.arange(15).reshape(3,6)
except ValueError as e:
    print("Error:", e)

> Error: cannot reshape array of size 15 into shape (3,6)
# Specify only one dimension (and infer the others) when reshaping
# Another way we can reshape is by metioning only one dimension, and -1. 
# -1 means that the length in that dimension is inferred
countries = np.array(["Germany","France", "Hungary","Austria","Italy","Denmark"])
countries

result >>> array(['Germany', 'France', 'Hungary', 'Austria', 'Italy', 'Denmark'],
result >>>       dtype='<U7')
countries.reshape(-1,3)

result >>> array([['Germany', 'France', 'Hungary'],
result >>>        ['Austria', 'Italy', 'Denmark']], dtype='<U7')
countries.reshape(3,-1)

result >>> array([['Germany', 'France'],
result >>>        ['Hungary', 'Austria'],
result >>>        ['Italy', 'Denmark']], dtype='<U7')
countries.reshape(3,2)

result >>> array([['Germany', 'France'],
result >>>        ['Hungary', 'Austria'],
result >>>        ['Italy', 'Denmark']], dtype='<U7')

#Splitting Arrays

split() an array into multiple sub-arrays. By specifying the number of equally shaped arrays to return, or by specifying the columns after which the division should occur

split(array, indices_or_sections, axis=0)

x = np.arange(9)
x

result >>> array([0, 1, 2, 3, 4, 5, 6, 7, 8])
# Split the array in 3 equal-sized subarrays:
np.split(x, 3)

result >>> [array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8])]
# The number of splits must be a divisor of the number of elements
# Or Numpy will complain that an even split is not possible

try:
    np.split(x, 4)
except TypeError as e:
    print("Error:", e)
except ValueError as e:
    print("Error:", e)

> Error: array split does not result in an equal division
# Split the array at positions indicated in 1-D array:
np.split(x,[4,7])

result >>> [array([0, 1, 2, 3]), array([4, 5, 6]), array([7, 8])]

hsplit. The numpy.hsplit is a special case of split() function where axis is 1 indicating a horizontal split regardless of the dimension of the input array.
In this example, the split will be performed along a column

y = np.array([("Germany","France", "Hungary","Austria"),
              ("Berlin","Paris", "Budapest","Vienna" )])
y

result >>> array([['Germany', 'France', 'Hungary', 'Austria'],
result >>>        ['Berlin', 'Paris', 'Budapest', 'Vienna']], dtype='<U8')
p1, p2 = np.hsplit(y, 2)
p1

result >>> array([['Germany', 'France'],
result >>>        ['Berlin', 'Paris']], dtype='<U8')
p2

result >>> array([['Hungary', 'Austria'],
result >>>        ['Budapest', 'Vienna']], dtype='<U8')
np.hsplit(y,4)

result >>> [array([['Germany'],
result >>>         ['Berlin']], dtype='<U8'),
result >>>  array([['France'],
result >>>         ['Paris']], dtype='<U8'),
result >>>  array([['Hungary'],
result >>>         ['Budapest']], dtype='<U8'),
result >>>  array([['Austria'],
result >>>         ['Vienna']], dtype='<U8')]
try:
    np.hsplit(y,3)
except TypeError as e:
    print("Error:", e)
except ValueError as e:
    print("Error:", e)

> Error: array split does not result in an equal division

vsplit splits along the vertical axis

p_1,p_2 = np.vsplit(y, 2)
print(p_1, p_2)

> [['Germany' 'France' 'Hungary' 'Austria']] [['Berlin' 'Paris' 'Budapest' 'Vienna']]

An alternative approach is array unpacking. In this example, we unpack the array into two variables. The array unpacks by row i.e Unpacking “unpacks” the first dimensions of an array

countries,capitals = y
print('Countries: ', countries)
print('Capitals: ' , capitals)

> Countries:  ['Germany' 'France' 'Hungary' 'Austria']
> Capitals:  ['Berlin' 'Paris' 'Budapest' 'Vienna']
b1,b2,b3,b4 = y.T
print(b1,b2,b3,b4)

> ['Germany' 'Berlin'] ['France' 'Paris'] ['Hungary' 'Budapest'] ['Austria' 'Vienna']

We can not use the following code, reason being the first dimension of array now contains 4 rows. If we want to split in 2 arrays horizontally we should use split or hsplit.

try:
    z1,z2 = y.T
except TypeError as e:
    print("Error:", e)
except ValueError as e:
    print("Error:", e)

> Error: too many values to unpack (expected 2)

#View vs Copy

  • When the contents are physically stored in another location, it is called Copy.
  • On the other hand, a different view of the same memory content is provided, we call it as View.
  • Different array objects can share the same data. NumPy has ndarray.view() method which is a new array object that looks at the same data of the original array.
fruits = np.array(["Apple","Mango","Grapes","Watermelon"])
basket_1 = fruits.view()
basket_2 = fruits.view()
print(basket_1)
print(basket_2)

> ['Apple' 'Mango' 'Grapes' 'Watermelon']
> ['Apple' 'Mango' 'Grapes' 'Watermelon']
print("ids for the arrays are different.")
print("id for fruits is : ", id(fruits))
print("id for baskets is : ", id(basket_1), id(basket_2))

> ids for the arrays are different.
> id for fruits is :  5873979664
> id for baskets is :  5873630160 5874177520
basket_1 is fruits

result >>> False
basket_1.base is fruits

result >>> True
# Change a few elements of basket. It changes the elements of fruits
# Here, we assign a new value to the first element of basket_2. You 
# might be astonished that the list of fruits has been "automatically" 
# changed as well. The explanation is that there has been no new assignment 
# to basket_2, only to one of its elements.
basket_2[0] = "Strawberry"
basket_2

result >>> array(['Strawberry', 'Mango', 'Grapes', 'Watermelon'], dtype='<U10')
fruits

result >>> array(['Strawberry', 'Mango', 'Grapes', 'Watermelon'], dtype='<U10')
basket_1

result >>> array(['Strawberry', 'Mango', 'Grapes', 'Watermelon'], dtype='<U10')
# Change the entire elements of basket. It does not change fruits
basket_1 = np.array(["Peach","Pineapple","Banana","Orange"])

# new np array
basket_1

result >>> array(['Peach', 'Pineapple', 'Banana', 'Orange'], dtype='<U9')
fruits

result >>> array(['Strawberry', 'Mango', 'Grapes', 'Watermelon'], dtype='<U10')
# Change the shape of basket. It does not change the shape of fruits
basket_2.shape = 2,2
basket_2

result >>> array([['Strawberry', 'Mango'],
result >>>        ['Grapes', 'Watermelon']], dtype='<U10')
# fruts are unchanged
fruits

result >>> array(['Strawberry', 'Mango', 'Grapes', 'Watermelon'], dtype='<U10')
# Slicing an array returns a view of it
mini_basket = fruits[2:]
mini_basket

result >>> array(['Grapes', 'Watermelon'], dtype='<U10')
fruits[3] = "Peach"
mini_basket

result >>> array(['Grapes', 'Peach'], dtype='<U10')

#Deep Copy

We now Create a deep copy of fruits

fruits = np.array(["Apple","Mango","Grapes","Watermelon"])
basket = fruits.copy()
basket is fruits

result >>> False
basket [0] = "Strawberry"
basket

result >>> array(['Strawberry', 'Mango', 'Grapes', 'Watermelon'], dtype='<U10')
fruits

result >>> array(['Apple', 'Mango', 'Grapes', 'Watermelon'], dtype='<U10')
basket.shape = 2,2
# Shape of basket: 
basket

result >>> array([['Strawberry', 'Mango'],
result >>>        ['Grapes', 'Watermelon']], dtype='<U10')
# Shape of fruits: 
fruits

result >>> array(['Apple', 'Mango', 'Grapes', 'Watermelon'], dtype='<U10')

#Indexing - Integer Arrays

NumPy arrays can be indexed with slices, but also with boolean or integer arrays (masks). It means passing an array of indices to access multiple array elements at once. This method is called fancy indexing. It creates copies not views.

a = np.arange(12)**2
a

result >>> array([  0,   1,   4,   9,  16,  25,  36,  49,  64,  81, 100, 121])

Suppose we want to access three different elements. We could do it like this:

a[2],a[6],a[8]

result >>> (4, 36, 64)

Alternatively, we can pass a single list or array of indices to obtain the same result:

indx_1 = [2,6,8]
a[indx_1]

result >>> array([ 4, 36, 64])

When using fancy indexing, the shape of the result reflects the shape of the index arrays rather than the shape of the array being indexed

indx_2 = np.array([[2,4],[8,10]])
a[indx_2]

result >>> array([[  4,  16],
result >>>        [ 64, 100]])
food = np.array([["blueberry","strawberry","cherry","blackberry"],
                 ["pinenut","hazelnuts","cashewnut","coconut"],
                 ["mustard","paprika","nutmeg","clove"]])
food

result >>> array([['blueberry', 'strawberry', 'cherry', 'blackberry'],
result >>>        ['pinenut', 'hazelnuts', 'cashewnut', 'coconut'],
result >>>        ['mustard', 'paprika', 'nutmeg', 'clove']], dtype='<U10')
# We will now select the corner elements of this array

row = np.array([[0,0],[2,2]])
col = np.array([[0,3],[0,3]])

food[row,col]

result >>> array([['blueberry', 'blackberry'],
result >>>        ['mustard', 'clove']], dtype='<U10')
# Notice that the first value in the result is food[0,0], next is food[0,3] , food[2,0] and lastly food[2,3]
food[2,0]

result >>> 'mustard'

Modifying Values with Fancy Indexing

food[row,col] = "000000"
food

result >>> array([['000000', 'strawberry', 'cherry', '000000'],
result >>>        ['pinenut', 'hazelnuts', 'cashewnut', 'coconut'],
result >>>        ['000000', 'paprika', 'nutmeg', '000000']], dtype='<U10')
a

result >>> array([  0,   1,   4,   9,  16,  25,  36,  49,  64,  81, 100, 121])
a[indx_1] = 999
a

result >>> array([  0,   1, 999,   9,  16,  25, 999,  49, 999,  81, 100, 121])

#Pandas Example

We will load the GDP per capita (in US dollars) data for all the countries in the world for the year 2016 into a variable using pandas.

GDP per capita (current US$)

GDP per capita is gross domestic product divided by midyear population. GDP is the sum of gross value added by all resident producers in the economy plus any product taxes and minus any subsidies not included in the value of the products. It is calculated without making deductions for depreciation of fabricated assets or for depletion and degradation of natural resources. Data are in current U.S. dollars.

url = "https://gist.githubusercontent.com/butuzov/ed1c7f8f3affe6dd005c1ee40dc3f7f2/raw/3ecb105e9a42abd7ddd1832d076442f094e7ef99/gdp_pc.csv"
gdp_16 = pd.read_csv(url)["2016"].values
type(gdp_16)

result >>> numpy.ndarray
gdp_16.shape

result >>> (264,)
plt.plot(gdp_16)
plt.show()

np.median(gdp_16)

result >>> nan
gdp_16

result >>> array([            nan,   1944.11700491,   6454.13537039,  11540.02556108,
result >>>                    nan,  16726.72218488,  72399.65347339,  19939.93077479,
result >>>          8832.76343477,             nan,  22661.48853616,  46012.32845157,
result >>>         50551.5531752 ,  17256.62697044,    777.75285166,  46428.67142458,
result >>>          2167.64282343,   1771.01528722,   3579.75679135,  19242.62261574,
result >>>                    nan,  22516.81648228,  12172.06646607,  18066.3006624 ,
result >>>          8461.53565294,             nan,   7234.19524335,  15123.85001428,
result >>>         18064.60442722,  77420.61217204,   8900.76455309,  16956.72477025,
result >>>           698.70665573,  44819.48360027,  26851.1685408 ,  63888.73238665,
result >>>                    nan,  23193.97411052,  15529.08410645,   3693.43690912,
result >>>          3609.37559695,    801.63011974,   5717.29037095,  14153.92792519,
result >>>          1521.85721808,   6551.31894829,  16609.73935418,  15370.63489842,
result >>>                    nan,             nan,             nan,  32707.872887  ,
result >>>         34749.21236327,  48860.52529211,             nan,  10947.71518781,
result >>>         49029.01483891,  15204.93246446,  15013.29637986,  13839.96863584,
result >>>          9175.64442097,  17025.39067931,  19516.08298373,  31366.86706838,
result >>>         11242.04751604,  11128.80247428,  42063.79406454,             nan,
result >>>         36304.85427275,  29743.33742802,   1734.46407452,  39610.86680734,
result >>>          4254.34870787,  43378.14602882,   9109.95196333,  41343.29253554,
result >>>                    nan,   3508.71519196,  18102.86221919,  42656.2166022 ,
result >>>         10004.5284119 ,   4292.44889992,             nan,   1966.37968826,
result >>>          1676.86477284,   1608.70437903,  26058.077443  ,  26778.50496589,
result >>>         14200.00719381,             nan,   7944.69135314,             nan,
result >>>          7836.3791763 ,  46864.95679091,  58617.97062639,   4736.83914255,
result >>>          2270.13030428,  23422.41797653,   1783.71512959,  26700.75608264,
result >>>         12933.94330355,  10588.17884122,   3627.9834637 ,   5201.03397093,
result >>>         11609.02662172,   2834.05481866,             nan,   6570.61624581,
result >>>                    nan,  71472.29596487,  19948.81948407,  17348.93653235,
result >>>         50745.68297711,  37258.22359457,  38380.17241178,   8821.31142953,
result >>>          9047.7689234 ,  42281.18818958,  25285.94773126,   3155.11426394,
result >>>          3552.09178948,   3736.96461203,   2108.53722771,  26382.78803193,
result >>>         36532.47268398,  74263.99862606,  14732.95458354,   6549.66799975,
result >>>         14308.75112714,    812.67394381,             nan,  12952.72516857,
result >>>         15210.95079266,   2622.06907623,   1701.37258736,             nan,
result >>>         12312.94005099,   6801.29184829,  10414.49383242,   2951.02189398,
result >>>         16689.09756884,  29862.31889041, 102389.43772763,  25587.38808847,
result >>>        105420.41423718,             nan,   7857.4877165 ,             nan,
result >>>          5332.28777311,   1506.23832521,  15347.99776172,  19515.40260527,
result >>>         17274.82307137,   4022.97863242,  11442.8331081 ,  14942.19764347,
result >>>          2125.71602653,  37928.34126866,   5721.22800363,  13113.97108338,
result >>>         17633.13298468,  12252.27719639,             nan,   1216.79278744,
result >>>          3852.52590437,  21102.5589232 ,   1168.82562558,  27682.6079403 ,
result >>>         56344.96375629,  10624.92711311,             nan,    986.20696062,
result >>>          5861.08965603,   5539.82671595,  50538.6065696 ,  58790.06140389,
result >>>          2477.9033451 ,  13966.49778022,  38565.40345261,  41885.92611335,
result >>>                    nan,  25029.76382064,   5235.47808502,  23008.66536715,
result >>>         13018.60861238,   7804.16780147,  16305.48534045,   4182.53934131,
result >>>         27383.254937  ,   3775.07502387,  37740.88825606,             nan,
result >>>         30658.63199533,   9567.33912818,             nan,   5700.08893902,
result >>>         44804.46017271,             nan, 127480.48251099,  23027.28926439,
result >>>         24788.67927912,   1912.90118206,   6062.85581457,  54416.61249319,
result >>>          4730.46186883,   2566.11917856,  87832.58651422,   2235.3146905 ,
result >>>          1476.21371969,   8616.81241755,  60932.93003722,             nan,
result >>>         14514.96055305,   3721.66067336,             nan,   3723.92091102,
result >>>         22092.44125736,   3237.3925436 ,  14966.7137949 ,  30460.38408107,
result >>>         32723.07195376,  48904.55437072,   8329.56058149,             nan,
result >>>         28383.89621333,             nan,             nan,   1990.72665731,
result >>>         13986.6283938 ,  20172.0612279 ,   1490.53623573,  16913.36607526,
result >>>          2979.31023902,  16875.9877489 ,  15126.46133663,   2140.35737389,
result >>>         13214.12254515,   5745.19122926,   6062.85581457,   3723.92091102,
result >>>         32854.71905846,  11595.51237921,  25247.2017505 ,   3651.02148404,
result >>>          2786.2722427 ,   1819.43435088,   8269.61463355,  16884.35346567,
result >>>         21619.60793375,  57638.15908799,   6512.68213067,  11456.78995224,
result >>>                    nan,             nan,             nan,   6295.59058534,
result >>>          3080.56575434,  16216.92698557,   6378.25656692,  10063.75847519,
result >>>          2507.47166249,  13196.81122696,   3933.06646044,   2027.08491653])
# The complement operator (~) can be used to remove nan elements
gdp_16 = gdp_16[~np.isnan(gdp_16)]
gdp_16

result >>> array([  1944.11700491,   6454.13537039,  11540.02556108,  16726.72218488,
result >>>         72399.65347339,  19939.93077479,   8832.76343477,  22661.48853616,
result >>>         46012.32845157,  50551.5531752 ,  17256.62697044,    777.75285166,
result >>>         46428.67142458,   2167.64282343,   1771.01528722,   3579.75679135,
result >>>         19242.62261574,  22516.81648228,  12172.06646607,  18066.3006624 ,
result >>>          8461.53565294,   7234.19524335,  15123.85001428,  18064.60442722,
result >>>         77420.61217204,   8900.76455309,  16956.72477025,    698.70665573,
result >>>         44819.48360027,  26851.1685408 ,  63888.73238665,  23193.97411052,
result >>>         15529.08410645,   3693.43690912,   3609.37559695,    801.63011974,
result >>>          5717.29037095,  14153.92792519,   1521.85721808,   6551.31894829,
result >>>         16609.73935418,  15370.63489842,  32707.872887  ,  34749.21236327,
result >>>         48860.52529211,  10947.71518781,  49029.01483891,  15204.93246446,
result >>>         15013.29637986,  13839.96863584,   9175.64442097,  17025.39067931,
result >>>         19516.08298373,  31366.86706838,  11242.04751604,  11128.80247428,
result >>>         42063.79406454,  36304.85427275,  29743.33742802,   1734.46407452,
result >>>         39610.86680734,   4254.34870787,  43378.14602882,   9109.95196333,
result >>>         41343.29253554,   3508.71519196,  18102.86221919,  42656.2166022 ,
result >>>         10004.5284119 ,   4292.44889992,   1966.37968826,   1676.86477284,
result >>>          1608.70437903,  26058.077443  ,  26778.50496589,  14200.00719381,
result >>>          7944.69135314,   7836.3791763 ,  46864.95679091,  58617.97062639,
result >>>          4736.83914255,   2270.13030428,  23422.41797653,   1783.71512959,
result >>>         26700.75608264,  12933.94330355,  10588.17884122,   3627.9834637 ,
result >>>          5201.03397093,  11609.02662172,   2834.05481866,   6570.61624581,
result >>>         71472.29596487,  19948.81948407,  17348.93653235,  50745.68297711,
result >>>         37258.22359457,  38380.17241178,   8821.31142953,   9047.7689234 ,
result >>>         42281.18818958,  25285.94773126,   3155.11426394,   3552.09178948,
result >>>          3736.96461203,   2108.53722771,  26382.78803193,  36532.47268398,
result >>>         74263.99862606,  14732.95458354,   6549.66799975,  14308.75112714,
result >>>           812.67394381,  12952.72516857,  15210.95079266,   2622.06907623,
result >>>          1701.37258736,  12312.94005099,   6801.29184829,  10414.49383242,
result >>>          2951.02189398,  16689.09756884,  29862.31889041, 102389.43772763,
result >>>         25587.38808847, 105420.41423718,   7857.4877165 ,   5332.28777311,
result >>>          1506.23832521,  15347.99776172,  19515.40260527,  17274.82307137,
result >>>          4022.97863242,  11442.8331081 ,  14942.19764347,   2125.71602653,
result >>>         37928.34126866,   5721.22800363,  13113.97108338,  17633.13298468,
result >>>         12252.27719639,   1216.79278744,   3852.52590437,  21102.5589232 ,
result >>>          1168.82562558,  27682.6079403 ,  56344.96375629,  10624.92711311,
result >>>           986.20696062,   5861.08965603,   5539.82671595,  50538.6065696 ,
result >>>         58790.06140389,   2477.9033451 ,  13966.49778022,  38565.40345261,
result >>>         41885.92611335,  25029.76382064,   5235.47808502,  23008.66536715,
result >>>         13018.60861238,   7804.16780147,  16305.48534045,   4182.53934131,
result >>>         27383.254937  ,   3775.07502387,  37740.88825606,  30658.63199533,
result >>>          9567.33912818,   5700.08893902,  44804.46017271, 127480.48251099,
result >>>         23027.28926439,  24788.67927912,   1912.90118206,   6062.85581457,
result >>>         54416.61249319,   4730.46186883,   2566.11917856,  87832.58651422,
result >>>          2235.3146905 ,   1476.21371969,   8616.81241755,  60932.93003722,
result >>>         14514.96055305,   3721.66067336,   3723.92091102,  22092.44125736,
result >>>          3237.3925436 ,  14966.7137949 ,  30460.38408107,  32723.07195376,
result >>>         48904.55437072,   8329.56058149,  28383.89621333,   1990.72665731,
result >>>         13986.6283938 ,  20172.0612279 ,   1490.53623573,  16913.36607526,
result >>>          2979.31023902,  16875.9877489 ,  15126.46133663,   2140.35737389,
result >>>         13214.12254515,   5745.19122926,   6062.85581457,   3723.92091102,
result >>>         32854.71905846,  11595.51237921,  25247.2017505 ,   3651.02148404,
result >>>          2786.2722427 ,   1819.43435088,   8269.61463355,  16884.35346567,
result >>>         21619.60793375,  57638.15908799,   6512.68213067,  11456.78995224,
result >>>          6295.59058534,   3080.56575434,  16216.92698557,   6378.25656692,
result >>>         10063.75847519,   2507.47166249,  13196.81122696,   3933.06646044,
result >>>          2027.08491653])
gdp_16.shape

result >>> (229,)
np.median(gdp_16)

result >>> 13113.9710833787
np.mean(gdp_16)

result >>> 19135.51651250454

The unofficial threshold for a country with a developed economy is a GDP per capita of USD 12,000. Some economists prefer to see a per capita GDP of at least USD 25,000 to be comfortable declaring a country as developed, however. Many highly developed countries, including the United States, have high per capita GDPs of USD 40,000 or above.

np.count_nonzero(gdp_16[gdp_16 > 40000])

result >>> 32
# 10 Lowest gdp per capita
np.sort(gdp_16)[0:10]

result >>> array([ 698.70665573,  777.75285166,  801.63011974,  812.67394381,
result >>>         986.20696062, 1168.82562558, 1216.79278744, 1476.21371969,
result >>>        1490.53623573, 1506.23832521])
# 10 Highest gdp per capita
np.sort(gdp_16)[-11:-1]
# np.sort(gdp_16)[219:]

result >>> array([ 58790.06140389,  60932.93003722,  63888.73238665,  71472.29596487,
result >>>         72399.65347339,  74263.99862606,  77420.61217204,  87832.58651422,
result >>>        102389.43772763, 105420.41423718])

How many countries have gdp per capita higher than Ukraine?

ukraine = gdp_16[gdp_16 > 8269.6], gdp_16[gdp_16 <= 8269.6]
# Hiegher GDP
np.count_nonzero(ukraine[0])

result >>> 145
# lower gdp
np.count_nonzero(ukraine[1])

result >>> 84
# Countries between Costa Rica and Finland
np.count_nonzero((gdp_16 > 16609.7) & (gdp_16 < 43378.1))

result >>> 65

#Indexing - Boolean Arrays

When we index arrays with arrays of (integer) indices we are providing the list of indices to pick. With boolean indices the approach is different; we explicitly choose which items in the array we want and which ones we don’t.

a = np.arange(16).reshape(4,4)
a

result >>> array([[ 0,  1,  2,  3],
result >>>        [ 4,  5,  6,  7],
result >>>        [ 8,  9, 10, 11],
result >>>        [12, 13, 14, 15]])
indx_bool = a > 9
indx_bool

result >>> array([[False, False, False, False],
result >>>        [False, False, False, False],
result >>>        [False, False,  True,  True],
result >>>        [ True,  True,  True,  True]])
a[indx_bool]

result >>> array([10, 11, 12, 13, 14, 15])
a[a > 9]

result >>> array([10, 11, 12, 13, 14, 15])
# filtering
a * (a > 9)

result >>> array([[ 0,  0,  0,  0],
result >>>        [ 0,  0,  0,  0],
result >>>        [ 0,  0, 10, 11],
result >>>        [12, 13, 14, 15]])
a < 6

result >>> array([[ True,  True,  True,  True],
result >>>        [ True,  True, False, False],
result >>>        [False, False, False, False],
result >>>        [False, False, False, False]])
np.count_nonzero(a < 6)

result >>> 6
np.sum(a < 6)

result >>> 6
# How many values less than 6 in each row?
np.sum(a < 6, axis=1)

result >>> array([4, 2, 0, 0])
# Are there any values greater than 8?
np.any(a > 8)

result >>> True
# Are all values less than 10?
np.all(a < 10)

result >>> False
# Are all values less than 100?
np.all(a < 100)

result >>> True
# Are all values in each row less than 9?
np.all(a < 9, axis=1)

result >>> array([ True,  True, False, False])

#nan’s

This trick doesn’t really works with float matrixes with nan values. You can run into RuntimeError.

#Structured Arrays

Structured arrays or record arrays are useful when you perform computations, and at the same time you could keep closely related data together. Structured arrays provide efficient storage for compound, heterogeneous data.

NumPy also provides powerful capabilities to create arrays of records, as multiple data types live in one NumPy array. However, one principle in NumPy that still needs to be honored is that the data type in each field (think of this as a column in the records) needs to be homogeneous.

name  = ["Alice","Beth","Cathy","Dorothy"]
studentId  = [1,2,3,4]
score = [85.4,90.4,87.66,78.9]

There’s nothing here that tells us that the three arrays are related; it would be more natural if we could use a single structure to store all of this data.

Define the np array with the names of the ‘columns’ and the data format for each

  • U10 represents a 10-character Unicode string
  • i4 is short for int32 (i for int, 4 for 4 bytes)
  • f8 is shorthand for float64
student_data = np.zeros(4, dtype={'names':('name', 'studentId', 'score'),
                          'formats':('U10', 'i4', 'f8')})
# np.zeros() for a string sets it to an empty string
student_data

result >>> array([('', 0, 0.), ('', 0, 0.), ('', 0, 0.), ('', 0, 0.)],
result >>>       dtype=[('name', '<U10'), ('studentId', '<i4'), ('score', '<f8')])
student_data.dtype

result >>> dtype([('name', '<U10'), ('studentId', '<i4'), ('score', '<f8')])

Now that we’ve created an empty container array, we can fill the array with our lists of values

student_data['name'] = name
student_data['studentId'] = studentId
student_data['score'] = score
student_data

result >>> array([('Alice', 1, 85.4 ), ('Beth', 2, 90.4 ), ('Cathy', 3, 87.66),
result >>>        ('Dorothy', 4, 78.9 )],
result >>>       dtype=[('name', '<U10'), ('studentId', '<i4'), ('score', '<f8')])

The handy thing with structured arrays is that you can now refer to values either by index or by name

student_data['name']

result >>> array(['Alice', 'Beth', 'Cathy', 'Dorothy'], dtype='<U10')
student_data['studentId']

result >>> array([1, 2, 3, 4], dtype=int32)
student_data['score']

result >>> array([85.4 , 90.4 , 87.66, 78.9 ])

If you index student_data at position 1 you get a structure:

student_data[1]

result >>> ('Beth', 2, 90.4)
# Get the name attribute from the last row
student_data[-1]['name']

result >>> 'Dorothy'
# Get names where score is above 85
student_data[student_data['score'] > 85]['name']

result >>> array(['Alice', 'Beth', 'Cathy'], dtype='<U10')

Note that if you’d like to do any operations that are any more complicated than these, you should probably consider the Pandas package with provides a powerful data structure called data frames.

#Broadcasting

  • Broadcasting is a powerful mechanism that allows numpy to work with arrays of different shapes when performing arithmetic operations.
  • Broadcasting solves the problem of arithmetic between arrays of differing shapes by in effect replicating the smaller array along the last mismatched dimension.
  • NumPy operations are usually done on pairs of arrays on an element-by-element basis. In the simplest case, the two arrays must have exactly the same shape, as in the following example:
a = np.array([1,2,3,4,5])
b = np.array([10,10,10,10,10])
a * b

result >>> array([10, 20, 30, 40, 50])

If the dimensions of two arrays are dissimilar, element-to-element operations are not possible. However, operations on arrays of non-similar shapes is still possible in NumPy, because of the broadcasting capability. The smaller array is broadcast to the size of the larger array so that they have compatible shapes.

a * 10

result >>> array([10, 20, 30, 40, 50])
# Scalar and n-Dimensional Array
np.ones((4,3)) * 10

result >>> array([[10., 10., 10.],
result >>>        [10., 10., 10.],
result >>>        [10., 10., 10.],
result >>>        [10., 10., 10.]])
# One-Dimensional and n-Dimensional Arrays
heights  = [165,170,168,183,172,169]
weights  = [61,76,56,81,62,60]
student_bio = np.array([heights,weights])
student_bio

result >>> array([[165, 170, 168, 183, 172, 169],
result >>>        [ 61,  76,  56,  81,  62,  60]])
factor_1 = np.array([0.0328084,2.20462 ])
factor_1.shape

result >>> (2,)
student_bio.shape

result >>> (2, 6)

General Broadcasting Rules

When operating on two arrays, NumPy compares their shapes element-wise. The dimensions are considered in reverse order, starting with the trailing dimensions, and working its way forward. Two dimensions are compatible when

  1. they are equal
  2. one of them is of size 1

Shape mismatch This fails because there is a mismatch in the trailing dimensions

  • student bio: 2 x 6
  • factor_1: 2

The trailing dimensions here are 6 and 2, so there is a mismatch

try:
    student_bio * factor_1
except ValueError as e:
    print("Error:", e)

> Error: operands could not be broadcast together with shapes (2,6) (2,)
factor_2 = np.array([[0.0328084],[2.20462 ]])
factor_2.shape

result >>> (2, 1)
student_bio * factor_2

result >>> array([[  5.413386 ,   5.577428 ,   5.5118112,   6.0039372,   5.6430448,
result >>>           5.5446196],
result >>>        [134.48182  , 167.55112  , 123.45872  , 178.57422  , 136.68644  ,
result >>>         132.2772   ]])

#Automatic Reshaping

If you’re specifying 2 out of 3 dimensions, the product of the two dimensions must be a divisor of the total number of elements in the array

a = np.arange(30)
a

result >>> array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
result >>>        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

-1 means “whatever is needed”

a.shape = 2,-1,3  
a

result >>> array([[[ 0,  1,  2],
result >>>         [ 3,  4,  5],
result >>>         [ 6,  7,  8],
result >>>         [ 9, 10, 11],
result >>>         [12, 13, 14]],
result >>> 
result >>>        [[15, 16, 17],
result >>>         [18, 19, 20],
result >>>         [21, 22, 23],
result >>>         [24, 25, 26],
result >>>         [27, 28, 29]]])
a.shape

result >>> (2, 5, 3)
a.shape = 2,3,-1
a

result >>> array([[[ 0,  1,  2,  3,  4],
result >>>         [ 5,  6,  7,  8,  9],
result >>>         [10, 11, 12, 13, 14]],
result >>> 
result >>>        [[15, 16, 17, 18, 19],
result >>>         [20, 21, 22, 23, 24],
result >>>         [25, 26, 27, 28, 29]]])
a.shape

result >>> (2, 3, 5)

#Vector Stacking

x = np.array([["Germany","France"],["Berlin","Paris"]])
y = np.array([["Hungary","Austria"],["Budapest","Vienna"]])
x.shape

result >>> (2, 2)
y.shape

result >>> (2, 2)
# Joining two arrays along axis 0
np.concatenate((x,y))

result >>> array([['Germany', 'France'],
result >>>        ['Berlin', 'Paris'],
result >>>        ['Hungary', 'Austria'],
result >>>        ['Budapest', 'Vienna']], dtype='<U8')
# Joining two arrays along axis 1 - Column-wise
np.concatenate((x,y), axis = 1)

result >>> array([['Germany', 'France', 'Hungary', 'Austria'],
result >>>        ['Berlin', 'Paris', 'Budapest', 'Vienna']], dtype='<U8')
a = np.array([1, 2, 3])
b = np.array([2, 3, 4])
np.stack((a, b))

result >>> array([[1, 2, 3],
result >>>        [2, 3, 4]])
studentId = np.array([1,2,3,4])
name   = np.array(["Alice","Beth","Cathy","Dorothy"])
scores  = np.array([65,78,90,81])
np.stack((studentId, name, scores))

result >>> array([['1', '2', '3', '4'],
result >>>        ['Alice', 'Beth', 'Cathy', 'Dorothy'],
result >>>        ['65', '78', '90', '81']], dtype='<U21')
np.stack((studentId, name, scores)).shape

result >>> (3, 4)
np.stack((studentId, name, scores), axis =1)

result >>> array([['1', 'Alice', '65'],
result >>>        ['2', 'Beth', '78'],
result >>>        ['3', 'Cathy', '90'],
result >>>        ['4', 'Dorothy', '81']], dtype='<U21')
np.stack((studentId, name, scores), axis =1).shape

result >>> (4, 3)
# Stacks row wise 

np.vstack((studentId, name, scores))

result >>> array([['1', '2', '3', '4'],
result >>>        ['Alice', 'Beth', 'Cathy', 'Dorothy'],
result >>>        ['65', '78', '90', '81']], dtype='<U21')
# Stacks column wise
np.hstack((studentId, name, scores))

result >>> array(['1', '2', '3', '4', 'Alice', 'Beth', 'Cathy', 'Dorothy', '65',
result >>>        '78', '90', '81'], dtype='<U21')
np.hstack((studentId, name, scores)).shape

result >>> (12,)

#Histograms

np.histogram([1, 2, 1] , bins = [0, 1, 2, 3])

result >>> (array([0, 2, 1]), array([0, 1, 2, 3]))

normal distribution

Mean = 2
StdDev = 0.5
Num of data points = 1000

mu, sigma = 2 , 0.5
data = np.random.normal(mu, sigma, 10000)
data

result >>> array([2.3202593 , 2.57649503, 2.60027418, ..., 1.57583234, 1.92830072,
result >>>        1.26634083])
(n, bin_edges) = np.histogram( data , bins = 50 )

Here, bins is an integer, it defines the number of equal-width bins in the given range (10, by default).

Numpy computes the occurrences of input data that fall within each bin, which in turns determines the area (not necessarily the height if the bins aren’t of equal width) of each bar.

  • n == number of samples in each bin
  • bin_edges = The right edges of the bins

NOTE: The number of bin_edges == len(histogram)+1. This is because bin_edges[0] is the left edge of the first bin, while all other values are the right edges of each bin

n

result >>> array([  3,   1,   0,   5,   6,   8,  11,  18,  21,  24,  43,  57,  75,
result >>>         91, 126, 145, 204, 257, 320, 363, 373, 459, 494, 547, 555, 564,
result >>>        592, 597, 559, 537, 469, 395, 373, 383, 296, 239, 204, 161, 141,
result >>>         82,  65,  40,  25,  28,  13,  12,  10,   3,   3,   3])
bin_edges

result >>> array([0.07511042, 0.14816475, 0.22121909, 0.29427342, 0.36732775,
result >>>        0.44038208, 0.51343641, 0.58649074, 0.65954507, 0.7325994 ,
result >>>        0.80565374, 0.87870807, 0.9517624 , 1.02481673, 1.09787106,
result >>>        1.17092539, 1.24397972, 1.31703406, 1.39008839, 1.46314272,
result >>>        1.53619705, 1.60925138, 1.68230571, 1.75536004, 1.82841438,
result >>>        1.90146871, 1.97452304, 2.04757737, 2.1206317 , 2.19368603,
result >>>        2.26674036, 2.3397947 , 2.41284903, 2.48590336, 2.55895769,
result >>>        2.63201202, 2.70506635, 2.77812068, 2.85117502, 2.92422935,
result >>>        2.99728368, 3.07033801, 3.14339234, 3.21644667, 3.289501  ,
result >>>        3.36255534, 3.43560967, 3.508664  , 3.58171833, 3.65477266,
result >>>        3.72782699])
plt.plot(bin_edges[ 1 : ], n)
plt.show()

matplotlib also has a function to build histograms (called hist) that differs from the one in NumPy.

The main difference is that pylab.hist plots the histogram automatically, while numpy.histogram only generates the data.

plt.hist([1, 2, 1], bins = [0, 1, 2, 3])
plt.show()

plt.hist(data, bins = 50)
plt.show()

# Bin selection can be automated
plt.hist(data, bins = 'auto')  
plt.show()

#Functions and Mehtods

url = "https://gist.githubusercontent.com/butuzov/ed1c7f8f3affe6dd005c1ee40dc3f7f2/raw/9012bdf4a623aa10e5abe3932028f79609337c19/sallary.csv"
salaries = np.genfromtxt(url, delimiter=',')
salaries

result >>> array([60000., 58000., 56967., ..., 54647., 25000., 70000.])
# Returns the indices of the maximum values along an axis.
salaries[np.argmax(salaries)]

result >>> 850000.0
# argmin
salaries[np.argmin(salaries)]

result >>> 11400.0
np.argsort(salaries)

result >>> array([282, 969, 606, ..., 829, 389, 246])
s = np.array([4,6,7,82,3,4])
s.shape = 2, -1
s

result >>> array([[ 4,  6,  7],
result >>>        [82,  3,  4]])
np.argsort(s)

result >>> array([[0, 1, 2],
result >>>        [1, 2, 0]])

specifying sorting algorithm

np.argsort(salaries,kind='mergesort')

result >>> array([282, 969, 606, ..., 829, 389, 246])
np.argsort(salaries,kind='quicksort')

result >>> array([282, 969, 606, ..., 829, 389, 246])

The functions max , min , sort gives the respective elements itself instead of indices.

The where() function returns the indices of elements in an input array where the given condition is satisfied.

np.where(salaries > 100000)

result >>> (array([  22,   27,   33,   45,   48,   59,   77,   78,   87,   91,   94,
result >>>          102,  109,  117,  123,  151,  221,  236,  242,  243,  246,  271,
result >>>          296,  297,  303,  337,  343,  378,  385,  389,  402,  408,  432,
result >>>          436,  450,  454,  503,  504,  538,  560,  563,  580,  581,  602,
result >>>          607,  687,  701,  728,  736,  745,  769,  779,  802,  819,  822,
result >>>          829,  859,  913,  922,  946,  949,  977,  980,  998, 1019, 1033,
result >>>         1062, 1065, 1104, 1105, 1112, 1130]),)

The extract() function returns the elements satisfying any condition

np.extract(salaries > np.mean(salaries), salaries)

result >>> array([ 60000.,  58000.,  56967.,  70000.,  75000.,  62000.,  56000.,
result >>>        261546.,  77000., 500000.,  77570.,  60000.,  65000.,  60000.,
result >>>        120000.,  81000.,  75000.,  75000.,  73000.,  75000., 210000.,
result >>>         90000., 105227.,  84000.,  98000.,  70000., 115000.,  72000.,
result >>>         57400.,  70000.,  71796., 130773., 103529.,  57000.,  70000.,
result >>>         75000.,  57000., 119875.,  60000., 132000., 100000., 141671.,
result >>>         87500.,  90000.,  65000., 109100.,  98080.,  87689., 550000.,
result >>>         56697.,  60000., 180000.,  92000.,  63000., 220000.,  66000.,
result >>>         80000.,  65460.,  65000., 109294.,  70000.,  57758.,  75000.,
result >>>         75000.,  62662.,  64000.,  90000.,  60000.,  60000.,  70000.,
result >>>         63000.,  80000.,  70000.,  60000.,  90000., 135380.,  65000.,
result >>>         70000., 103529.,  64000.,  70000., 120000., 185000.,  70000.,
result >>>        850000.,  61500.,  60000.,  63000., 181339.,  75665.,  58000.,
result >>>         57000., 100000.,  65000.,  62662.,  90000.,  70835.,  75000.,
result >>>        111600., 102000.,  60000.,  65000., 625000., 100000.,  78000.,
result >>>         75000.,  60000.,  64000.,  63000.,  59656.,  69000.,  80750.,
result >>>         84703., 110464., 102000.,  56732.,  63000.,  62000.,  60000.,
result >>>         70151.,  60000.,  56400.,  65000., 140000.,  75000.,  65000.,
result >>>        120000., 820000.,  80000., 250000.,  58000., 130000.,  85000.,
result >>>         60000.,  65000.,  71925., 136375.,  90000., 141000.,  90000.,
result >>>         65000.,  66000.,  72000.,  65000., 142800.,  68000., 116626.,
result >>>         65000.,  56000.,  57758.,  58000.,  60000.,  60000.,  97000.,
result >>>         67382.,  90000.,  85000.,  75000.,  57000.,  60000.,  76600.,
result >>>         60000., 110000., 130000.,  61398.,  60000.,  65000.,  70000.,
result >>>        100000.,  59000.,  57000.,  60000.,  76000.,  60000.,  88000.,
result >>>        444000.,  59000.,  70000.,  72000., 100000.,  70000., 116000.,
result >>>         65000., 110000.,  70000.,  82000.,  56000.,  60000.,  72000.,
result >>>         60000.,  70000., 113996., 160000.,  60000.,  81733.,  66000.,
result >>>         80000.,  66000., 150000.,  56000., 108000.,  62031., 100000.,
result >>>         60000.,  68000.,  60000.,  60000.,  60000.,  90000.,  61700.,
result >>>         75000.,  77000.,  96000.,  60000.,  86000.,  68000.,  63000.,
result >>>         65000.,  60000.,  90000.,  90000.,  80000.,  63000.,  80000.,
result >>>         70915., 120000.,  73500.,  60000.,  60000., 120000.,  56000.,
result >>>         84396.,  95000.,  94704.,  71000.,  70000.,  60000., 220000.,
result >>>        117701.,  57758.,  80000.,  65000.,  72000.,  70000., 120000.,
result >>>         60000.,  60000.,  63500.,  60000.,  59640.,  61027.,  70000.,
result >>>         61800., 120010.,  83500.,  75000.,  72000.,  84000., 110000.,
result >>>         60005.,  59500.,  70518.,  66000.,  58000.,  70000.,  99500.,
result >>>         66858., 103645.,  77000.,  56000.,  62000.,  60000.,  57400.,
result >>>         92735.,  58000.,  69000., 102000.,  68000., 108977.,  75653.,
result >>>         68000.,  59938., 680000.,  56967.,  57600.,  75000.,  60000.,
result >>>         60000.,  58000., 152568.,  98324.,  65000.,  63605.,  82000.,
result >>>         57000.,  62000.,  85000.,  60000.,  62500.,  68656.,  87280.,
result >>>         60000.,  57400.,  76000.,  63207.,  59000., 114800.,  60000.,
result >>>         72000., 150000.,  70000.,  60000.,  57000.,  70000.,  59938.,
result >>>        111841., 110000.,  63000.,  63207.,  59000.,  64000.,  58000.,
result >>>         75000.,  75000.,  70500.,  61000.,  60555., 108000.,  60000.,
result >>>         80000., 102000., 100000.,  64800.,  62000., 134449.,  75000.,
result >>>         70000.,  65000.,  94000.,  65000.,  60000., 120000.,  80000.,
result >>>         95000.,  65000., 101349.,  74000.,  70000.,  76289.,  95000.,
result >>>         78000.,  72000.,  92000.,  65000., 120000.,  72000., 250000.,
result >>>         68500.,  62000.,  81000.,  70000.,  84000.,  56000.,  87689.,
result >>>        175000., 261546.,  59000.,  56000., 120000.,  82000.,  70000.,
result >>>         71380.,  62000., 100000.,  80000., 103000.,  57000.,  76800.,
result >>>         75000.,  90000.,  70000.])

#Vectorization

def addsubtract(a,b):
    "Return a-b if a > b, otherwise return a + b"
    if a > b:
        return a - b
    else:
        return a + b
addsubtract(5,7)

result >>> 12
vec_addsubtract = np.vectorize(addsubtract)
vec_addsubtract([0,3,6,9],[1,3,5,7])

result >>> array([1, 6, 1, 2])
# wrong shape
try:
    vec_addsubtract([0,3,6,9],[1,3,5,7,8])
except ValueError as e:
    print("Error:", e)

> Error: operands could not be broadcast together with shapes (4,) (5,)

#Tips and tricks

#Computing order of elements in array

data = np.random.random(10)
np.argsort(np.argsort(data))

result >>> array([4, 0, 7, 3, 2, 6, 5, 8, 9, 1])
!pip install scipy

> Requirement already satisfied: scipy in /Users/butuzov/.venv/lib/python3.11/site-packages (1.11.4)
> Requirement already satisfied: numpy<1.28.0,>=1.21.6 in /Users/butuzov/.venv/lib/python3.11/site-packages (from scipy) (1.26.1)
> 
> [notice] A new release of pip is available: 23.3.1 -> 23.3.2
> [notice] To update, run: pip install --upgrade pip
# scipy function which does the same, but it's more general and faster, so prefer using it:
from scipy.stats import rankdata
rankdata(data) - 1

result >>> array([4., 0., 7., 3., 2., 6., 5., 8., 9., 1.])

#IronTransform (flattener of distribution)

Sometimes you need to write monotonic tranformation, which turns one distribution into uniform.

This method is useful to compare distributions or to work with distributions with heavy tails or strange shape.

class IronTransform:
    def fit(self, data, weights):
        weights = weights / weights.sum()
        sorter = np.argsort(data)
        self.x = data[sorter]
        self.y = np.cumsum(weights[sorter])
        return self
        
    def transform(self, data):
        return np.interp(data, self.x, self.y)
    
    
sig_pred = np.random.normal(size=10000) + 1
bck_pred = np.random.normal(size=10000) - 1 

plt.figure()
plt.hist(sig_pred, bins=30, alpha=0.5)
plt.hist(bck_pred, bins=30, alpha=0.5)
plt.show()

iron = IronTransform().fit(sig_pred, weights=np.ones(len(sig_pred)))

plt.figure()
plt.hist(iron.transform(sig_pred), bins=30, alpha=0.5)
plt.hist(iron.transform(bck_pred), bins=30, alpha=0.5)
plt.show()

flatnonzero

## flattennonzero
np.flatnonzero(np.arange(10).reshape(2,5))

result >>> array([1, 2, 3, 4, 5, 6, 7, 8, 9])
np.flatnonzero(np.arange(10) % 2 == 0 )

result >>> array([0, 2, 4, 6, 8])