Data handling and manipulation with Pandas

  14 minute read

What is pandas?

Pandas is a Python data analysis library that provides integrated, feature-rich utilities for handling, wrangling, and analyzing datasets.

How to install pandas?

# using Miniconda
# to install a specific version, provide pandas=<version number> (e.g. pandas=1.0)
conda install pandas

# using pip
pip install pandas

Check pandas version

# you can use interactive python interpreter, jupyter notebook or python code
# I am using interactive python interpreter (Python 3.7)
>>> import pandas as pd
>>> pd.__version__
'0.25.3'

Create dataframe and series using pandas

# you can use interactive python interpreter, jupyter notebook or python code
# I am using interactive python interpreter (Python 3.7)

# create pandas dataframe (two-dimensional)
>>> import pandas as pd
# dataframe from dict
>>> df1 = pd.DataFrame({'col1':['A', 'B', 'C', 'D', 'E'], 'col2':[1,2,3,4,5], 'col3':[0.1,0.2,0.3,0.4,0.5]})
>>> df1
  col1  col2  col3
0    A     1   0.1
1    B     2   0.2
2    C     3   0.3
3    D     4   0.4
4    E     5   0.5

# Create a dataframe with some random numbers
>>> import pandas as pd
>>> import numpy as np
>>> df2 = pd.DataFrame(np.random.randint(0,100,size=(1000, 10)), columns=['col%i' % i for i in np.arange(10)])
# print first 5 rows
>>> df2.head()
   col0  col1  col2  col3  col4  col5  col6  col7  col8  col9
0     2    72     7    23    82    76    29    86    55    41
1    37    23    73    62    53    70    50    30     5    11
2    38    13    64    54    19    35    93    67    25    13
3    55    58    82    92    35    24     0    57     7     6
4    75    65    82    68    16    49    50    86    52    65

# create pandas series (one-dimensional)
>>> df3 = pd.Series(np.random.randint(0,100, size=10))
>>> df3.head()
0     5
1    37
2    59
3    53
4    37
dtype: int32

Import data from file

# read csv file
>>> import pandas as pd
>>> df4 = pd.read_csv("https://reneshbedre.github.io/assets/posts/volcano/testvolcano.csv")
>>> df4.head()
          GeneNames  value1  value2    log2FC       p-value
0  LOC_Os09g01000.1    8862   32767 -1.886539  1.250000e-55
1  LOC_Os12g42876.1    1099     117  3.231611  1.050000e-55
2  LOC_Os12g42884.2     797      88  3.179004  2.590000e-54
3  LOC_Os03g16920.1     274       7  5.290677  4.690000e-54
4  LOC_Os05g47540.4     308      18  4.096862  2.190000e-54

# read tab file
>>> df5 = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/onewayanova.txt", sep="\t")
>>> df5.head()
    A   B   C   D
0  25  45  30  54
1  30  55  29  60
2  28  29  33  51
3  36  56  37  62
4  29  40  27  73

# use the first column of the table as the index
>>> df6 = pd.read_csv("https://reneshbedre.github.io/assets/posts/volcano/testvolcano.csv", index_col=0)
>>> df6.head()
                  value1  value2    log2FC       p-value
GeneNames
LOC_Os09g01000.1    8862   32767 -1.886539  1.250000e-55
LOC_Os12g42876.1    1099     117  3.231611  1.050000e-55
LOC_Os12g42884.2     797      88  3.179004  2.590000e-54
LOC_Os03g16920.1     274       7  5.290677  4.690000e-54
LOC_Os05g47540.4     308      18  4.096862  2.190000e-54

# import specific columns only
# you can also pass column names instead of numbers to the usecols argument (see the example below)
>>> df7 = pd.read_csv("https://reneshbedre.github.io/assets/posts/volcano/testvolcano.csv", usecols=[0,3,4])
>>> df7.head()
          GeneNames    log2FC       p-value
0  LOC_Os09g01000.1 -1.886539  1.250000e-55
1  LOC_Os12g42876.1  3.231611  1.050000e-55
2  LOC_Os12g42884.2  3.179004  2.590000e-54
3  LOC_Os03g16920.1  5.290677  4.690000e-54
4  LOC_Os05g47540.4  4.096862  2.190000e-54
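
The same selection can be made with column names instead of numbers; for example (df7b is just an illustration name):

# import specific columns by name
>>> df7b = pd.read_csv("https://reneshbedre.github.io/assets/posts/volcano/testvolcano.csv", usecols=['GeneNames', 'log2FC', 'p-value'])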

# skip the first n rows
# you can also skip rows from the bottom using the skipfooter argument (see the example below)
>>> df8 = pd.read_csv("https://reneshbedre.github.io/assets/posts/volcano/testvolcano.csv", index_col=0, skiprows=5, header=None)
>>> df8.head()
                    1     2         3             4
0
LOC_Os05g47540.4  308    18  4.096862  2.190000e-54
LOC_Os09g00999.1  339  1213 -1.839222  1.950000e-54
LOC_Os01g62900.1  193     4  5.592457  1.560000e-54
LOC_Os09g27030.2  504   162  1.637430  1.360000e-54
LOC_Os01g51410.2  225    22  3.354350  6.760000e-54
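
To skip rows from the bottom instead, the skipfooter argument can be used; note that it requires the slower python parser engine (df8b is just an illustration name):

# skip the last 5 rows (skipfooter needs engine='python')
>>> df8b = pd.read_csv("https://reneshbedre.github.io/assets/posts/volcano/testvolcano.csv", index_col=0, skipfooter=5, engine='python')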

# read only first n rows
>>> df9 = pd.read_csv("https://reneshbedre.github.io/assets/posts/volcano/testvolcano.csv", index_col=0, nrows=100)
>>> df9
                  value1  value2    log2FC       p-value
GeneNames
LOC_Os09g01000.1    8862   32767 -1.886539  1.250000e-55
LOC_Os12g42876.1    1099     117  3.231611  1.050000e-55
LOC_Os12g42884.2     797      88  3.179004  2.590000e-54
LOC_Os03g16920.1     274       7  5.290677  4.690000e-54
LOC_Os05g47540.4     308      18  4.096862  2.190000e-54
...
[100 rows x 4 columns]

# skip comment lines
# the command below will skip any line starting with #
# the comment argument must be a single character (comment='#' is okay, but comment='##' will raise an error)
>>> df = pd.read_csv("file.csv", comment='#')

Check data type (dtype)

Check the data type of each column in the dataframe

>>> df1.dtypes
col1     object
col2      int64
col3    float64
dtype: object
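
If a column has the wrong dtype, it can be converted with astype; a minimal example on df1:

# convert col2 from int64 to float64
>>> df1['col2'].astype(float)
0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
Name: col2, dtype: float64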

Check dataframe dimensions (size, shape and ndim)

Check the total number of elements, columns, and rows in a dataframe

# get total number of elements in a dataframe 
>>> df1.size
15
>>> df2.size
10000

# get number of rows and columns
>>> df1.shape
(5, 3)  # (rows, columns)
>>> df2.shape
(1000, 10) # (rows, columns)

# get the array dimensions (number of axes)
# returns 1 for pandas series (one-dimensional) and 2 for dataframes (two-dimensional) 
>>> df1.ndim
2
>>> df2.ndim
2
>>> df3.ndim
1

Extracting columns and rows from dataframe

# get column data based on column name
>>> df1['col2']
0    1
1    2
2    3
3    4
4    5
Name: col2, dtype: int64

# get row data based on row index
# the returned row will be a pandas Series
# extract the row with index 0
>>> df1.loc[0] 
col1      A
col2      1
col3    0.1
Name: 0, dtype: object

# assign first column (col1) as index
>>> df1_1 = df1.set_index('col1')
>>> df1_1
      col2  col3
col1
A        1   0.1
B        2   0.2
C        3   0.3
D        4   0.4
E        5   0.5

# get row data based on row label (col1 as index)
>>> df1_1.loc['E']
col2    5.0
col3    0.5
Name: E, dtype: float64

Extracting subsets of dataframes

# get a subset of the dataframe with loc[rows, columns]
>>> df1_1.loc['A':'C', 'col2':'col3']
      col2  col3
col1
A        1   0.1
B        2   0.2
C        3   0.3

# based on the default numeric index (note: loc slices include both endpoints)
>>> df2.loc[5:9, 'col2':'col6']
   col2  col3  col4  col5  col6
5    49     5    61    87    23
6    20    57    40    21    17
7    30    30    38    94    50
8    98    17    31    67    59
9    39     9     4    17     8

# get specific columns with all rows
>>> df2[['col1', 'col5', 'col8']]
     col1  col5  col8
0      21     2    14
1      93    29    93
2      23    24    32
3      26    72    55
4      17    96    40

# get specific rows with all columns
>>> df2[2:6]
   col0  col1  col2  col3  col4  col5  col6  col7  col8  col9
2     4    23    64    49    71    24    97    92    32    39
3    73    26    33    99    62    72    73     3    55    82
4    72    17    73    71    42    96    56    29    40    96
5    16    10    49     5    61    87    23     7    36     1

# get specific columns with specific rows
>>> df2.loc[2:4, ['col1', 'col5', 'col8']]
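
loc selects by labels; its positional counterpart iloc selects by integer position, and unlike loc its slices exclude the end position. The selection above could be written positionally like this:

# same subset by position with iloc (end of slice excluded, unlike loc)
>>> df2.iloc[2:5, [1, 5, 8]]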


Extracting rows and columns based on conditional match

import pandas as pd
# dataframe from dict
df = pd.DataFrame({'col1':['A', 'A', 'C', 'D', 'E'], 'col2':[1,2,3,4,5], 'col3':[0.1,0.2,0.3,0.4,0.5]})
df
# output
  col1  col2  col3
0    A     1   0.1
1    A     2   0.2
2    C     3   0.3
3    D     4   0.4
4    E     5   0.5

# extract rows where col1 has the value A
df.loc[df['col1'] == 'A']
# output
  col1  col2  col3
0    A     1   0.1
1    A     2   0.2

# extract rows where col1 has the value A or D
df.loc[df['col1'].isin(['A', 'D'])]
# output
  col1  col2  col3
0    A     1   0.1
1    A     2   0.2
3    D     4   0.4
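
Multiple conditions can be combined with & (and), | (or), and ~ (not); each condition must be wrapped in parentheses. For example:

# extract rows where col1 is A and col2 is greater than 1
df.loc[(df['col1'] == 'A') & (df['col2'] > 1)]
# output
  col1  col2  col3
1    A     2   0.2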

Statistics of dataframes

# check above to create dataframe objects (df1, df2 and df1_1)
# get descriptive statistics (count, mean, std dev, min, max, and quartiles)
# it excludes all NaN or missing values
>>> df1.describe()
           col2      col3
count  5.000000  5.000000
mean   3.000000  0.300000
std    1.581139  0.158114
min    1.000000  0.100000
25%    2.000000  0.200000
50%    3.000000  0.300000
75%    4.000000  0.400000
max    5.000000  0.500000

# get only certain elements such as mean or count
# get mean
>>> df1.describe().loc['mean']
col2    3.0
col3    0.3
Name: mean, dtype: float64

# get mean and std dev
>>> df1.describe().loc[['mean', 'std']]
          col2      col3
mean  3.000000  0.300000
std   1.581139  0.158114

# get standard error of mean (sem)
>>> df1.sem()
col2    0.707107
col3    0.070711
dtype: float64

# for a specific column only
>>> df2['col4'].describe()
count    1000.000000
mean       48.917000
std        28.679865
min         0.000000
25%        25.000000
50%        49.000000
75%        74.000000
max        99.000000
Name: col4, dtype: float64

# for multiple selected columns
>>> df2.loc[:,'col1':'col5'].describe()
              col1         col2         col3         col4         col5
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     49.334000    51.057000    48.676000    48.917000    48.486000
std      28.311375    29.141293    28.252065    28.679865    29.070619
min       0.000000     0.000000     0.000000     0.000000     0.000000
25%      25.000000    26.000000    25.000000    25.000000    22.000000
50%      50.000000    52.000000    49.000000    49.000000    50.000000
75%      73.000000    77.000000    73.000000    74.000000    73.000000
max      99.000000    99.000000    99.000000    99.000000    99.000000

# to get descriptive statistics row-wise, you can transpose the dataframe
# or use the apply function (a transpose example follows below)
>>> df1_1.apply(pd.DataFrame.describe, axis=1)
      count  mean       std  min    25%   50%    75%  max
col1
A       2.0  0.55  0.636396  0.1  0.325  0.55  0.775  1.0
B       2.0  1.10  1.272792  0.2  0.650  1.10  1.550  2.0
C       2.0  1.65  1.909188  0.3  0.975  1.65  2.325  3.0
D       2.0  2.20  2.545584  0.4  1.300  2.20  3.100  4.0
E       2.0  2.75  3.181981  0.5  1.625  2.75  3.875  5.0
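
The transpose route mentioned above gives the same statistics with rows and columns swapped:

# alternate way: transpose first, then describe column-wise
>>> df1_1.T.describe()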

# to get row-wise descriptive statistics for specific columns only
>>> df2.loc[:,'col0':'col4'].apply(pd.DataFrame.describe, axis=1)
     count  mean        std   min   25%   50%   75%   max
0      5.0  43.8  37.325594  12.0  12.0  30.0  69.0  96.0
1      5.0  52.8  34.687173   1.0  43.0  55.0  71.0  94.0
2      5.0  61.6  36.287739  14.0  34.0  72.0  90.0  98.0
3      5.0  70.0  32.992423  21.0  52.0  83.0  96.0  98.0
4      5.0  43.4  34.681407   3.0   9.0  59.0  73.0  73.0
...
[1000 rows x 8 columns]

Add/Insert rows and columns to existing dataframes

# check above to create dataframe objects (df1, df2 and df1_1)
# add an extra row to the existing df1 dataframe
>>> df1_2 = df1.append({'col1': 'F', 'col2': 6, 'col3': 0.6}, ignore_index=True)
# ignore_index=True is required when appending a dict; it resets the index of
# the result (0 to n-1) instead of preserving the original index labels
>>> df1_2
  col1  col2  col3
0    A     1   0.1
1    B     2   0.2
2    C     3   0.3
3    D     4   0.4
4    E     5   0.5
5    F     6   0.6

# adding a column to dataframe
>>> df1_2['col4'] = [1, 4, 9, 16, 25,36]
>>> df1_2
  col1  col2  col3  col4
0    A     1   0.1     1
1    B     2   0.2     4
2    C     3   0.3     9
3    D     4   0.4    16
4    E     5   0.5    25
5    F     6   0.6    36
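
To add a column at a specific position rather than at the end, DataFrame.insert can be used; it modifies the dataframe in place, so the sketch below works on a copy (df_tmp and the col0 values are just for illustration):

# insert a column named col0 at position 1 (modifies the dataframe in place)
>>> df_tmp = df1_2.copy()
>>> df_tmp.insert(1, 'col0', [10, 20, 30, 40, 50, 60])
>>> df_tmp.head(2)
  col1  col0  col2  col3  col4
0    A    10     1   0.1     1
1    B    20     2   0.2     4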

# merge/append two dataframes
# create a new dataframe 
>>> df10 = pd.DataFrame({'col1':['X', 'Y'], 'col2':[7,8], 'col3':[0.7,0.8]})
>>> df10
  col1  col2  col3
0    X     7   0.7
1    Y     8   0.8

# merge df10 with df1_2
>>> df1_2.append(df10)
  col1  col2  col3  col4
0    A     1   0.1   1.0
1    B     2   0.2   4.0
2    C     3   0.3   9.0
3    D     4   0.4  16.0
4    E     5   0.5  25.0
5    F     6   0.6  36.0
0    X     7   0.7   NaN
1    Y     8   0.8   NaN

# with ignore_index=True, the original index labels are discarded and a new 0 to n-1 index is assigned
>>> df1_2.append(df10, ignore_index=True)
  col1  col2  col3  col4
0    A     1   0.1   1.0
1    B     2   0.2   4.0
2    C     3   0.3   9.0
3    D     4   0.4  16.0
4    E     5   0.5  25.0
5    F     6   0.6  36.0
6    X     7   0.7   NaN
7    Y     8   0.8   NaN
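
Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0; in current pandas versions pd.concat produces the same result:

# equivalent of df1_2.append(df10, ignore_index=True) in recent pandas
>>> pd.concat([df1_2, df10], ignore_index=True)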

Export dataframes to file

# check above to create dataframe objects (df1)
# Export to CSV file
# the index is written as the first column by default
>>> df1.to_csv("df1.csv")
# do not export index as first column
>>> df1.to_csv("df1.csv", index=False)
# Export to a tab-delimited file
>>> df1.to_csv("df1.txt", sep='\t')
# no column headers
>>> df1.to_csv("df1.csv", header=False)
# compress the output file (requires pandas 1.0.0 or later)
>>> df1.to_csv("df1.zip", compression={'method':'zip', 'archive_name':'df1.csv'})
# export to excel file
# use similar option as above for headers and index
>>> df1.to_excel("df1.xlsx")
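
The compressed file can be read back directly, since read_csv infers the compression from the file extension (df_back is just an illustration name):

# read the zipped csv back; compression is inferred from the .zip extension
>>> df_back = pd.read_csv("df1.zip")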

Convert series to dataframes

# create pandas series
>>> df11 = pd.Series([1,2,3])
>>> df11
0    1
1    2
2    3
dtype: int64

# convert to dataframe
>>> df11.to_frame()
   0
0  1
1  2
2  3

# add column name
>>> df11.to_frame('col1')
   col1
0     1
1     2
2     3

Convert numpy arrays to dataframes

>>> import numpy as np
>>> x = np.array([[1, 2], [3, 4]])

# convert to dataframe
>>> df12 = pd.DataFrame({'col1':x[:,0], 'col2':x[:,1]})
# alternate way: df12 = pd.DataFrame(x, columns=['col1', 'col2'])
>>> df12
   col1  col2
0     1     2
1     3     4

Convert dataframe to numpy arrays

# see df12 dataframe in above section
# convert whole dataframe to numpy array
>>> df12.to_numpy()
array([[1, 2],
       [3, 4]])

# extract specific column as numpy array
>>> df12['col1'].to_numpy()
array([1, 3])

# extract multiple columns as numpy array
>>> df12[['col1','col2']].to_numpy()
array([[1, 2],
       [3, 4]])

# extract specific row as numpy array
>>> df12.loc[0].to_numpy()
array([1, 2])

# extract multiple rows as numpy array
>>> df12.loc[0:1].to_numpy()
array([[1, 2],
       [3, 4]])

Find min and max values in dataframe

Load an example dataset into a dataframe

>>> import pandas as pd
>>> import numpy as np
>>> df = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/onewayanova.txt", sep="\t")
>>> df.head()
    A   B   C   D
0  25  45  30  54
1  30  55  29  60
2  28  29  33  51
3  36  56  37  62
4  29  40  27  73

# find min in every column
>>> df.min()
A    25
B    29
C    27
D    51
dtype: int64

# find max in every column
>>> df.max()
A    36
B    56
C    37
D    73
dtype: int64

# find min and max in every row
>>> df.min(axis=1)
>>> df.max(axis=1)

# find max and min from whole dataframe
>>> np.min(df.min().to_numpy())
25
>>> np.max(df.max().to_numpy())
73
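
Chaining the reductions gives the same result without going through numpy:

# alternate way: chain min/max
>>> df.min().min()
25
>>> df.max().max()
73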

# find min and max in particular columns or rows
# max in column A
>>> df['A'].max()
36
# max in the second row (index 1)
>>> df.loc[1].max()
60
# multiple columns
>>> df[['A', 'D']].max()
A    36
D    73
dtype: int64
# multiple rows (rows 0 and 1)
>>> df.loc[0:1].max(axis=1)
0    54
1    60
dtype: int64

# get row index of max and min values for each column
>>> df.idxmax()
A    3
B    3
C    3
D    4
dtype: int64
# for min use df.idxmin()

# missing values (NaN) are skipped by default (skipna=True)
# to include missing values, set skipna=False
>>> df.min(skipna=True)


Modifying dataframe columns and column names

import pandas as pd
df = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/onewayanova.txt", sep="\t")
df.head()
    A   B   C   D
0  25  45  30  54
1  30  55  29  60
2  28  29  33  51
3  36  56  37  62
4  29  40  27  73

# check column names
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')

# change column names
df.columns = ['A1', 'B1', 'C1', 'D1']
df.head()
   A1  B1  C1  D1
0  25  45  30  54
1  30  55  29  60
2  28  29  33  51
3  36  56  37  62
4  29  40  27  73

# reorder column names
df1 = df[['B1', 'A1', 'C1', 'D1']]
# alternate way: df1 = df.reindex(columns=['B1', 'A1', 'C1', 'D1'])
df1
   B1  A1  C1  D1
0  45  25  30  54
1  55  30  29  60
2  29  28  33  51
3  56  36  37  62
4  40  29  27  73

# add a new column
df['E'] = [55, 58, 61, 70, 85]
df.head()
   A1  B1  C1  D1   E
0  25  45  30  54  55
1  30  55  29  60  58
2  28  29  33  51  61
3  36  56  37  62  70
4  29  40  27  73  85

# drop columns D1 and E
# axis=1 refers to column labels (use axis=0 for index labels)
df.drop(['D1', 'E'], axis=1)
   A1  B1  C1
0  25  45  30
1  30  55  29
2  28  29  33
3  36  56  37
4  29  40  27

# rename specific columns
df.rename(columns={"A1": "a1", "B1": "b1"})
   a1  b1  C1  D1   E
0  25  45  30  54  55
1  30  55  29  60  58
2  28  29  33  51  61
3  36  56  37  62  70
4  29  40  27  73  85
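
Note that drop and rename above return new dataframes and leave df unchanged; to keep the change, assign the result back (or pass inplace=True):

# keep the renamed columns by assigning the result back to df
df = df.rename(columns={"A1": "a1", "B1": "b1"})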

Join two dataframes (left, right, outer, and inner joins)

>>> import pandas as pd
>>> df1 = pd.DataFrame({'col1':['A', 'B', 'C', 'D', 'E', 'F', 'I'], 'col2':[1,2,3,4,5,6,8] })
>>> df1
  col1  col2
0    A     1
1    B     2
2    C     3
3    D     4
4    E     5
5    F     6
6    I     8
>>> df2 = pd.DataFrame({'col1':['A', 'B', 'C', 'D', 'E', 'G'], 'col3':[0.1,0.2,0.3,0.4,0.5,0.7] })
>>> df2
  col1  col3
0    A   0.1
1    B   0.2
2    C   0.3
3    D   0.4
4    E   0.5
5    G   0.7

# left join df1 and df2 (join based on all keys from df1)
>>> df_left_joined = pd.merge(df1, df2, how='left', on='col1')
>>> df_left_joined
  col1  col2  col3
0    A     1   0.1
1    B     2   0.2
2    C     3   0.3
3    D     4   0.4
4    E     5   0.5
5    F     6   NaN
6    I     8   NaN

# right join df1 and df2 (join based on all keys from df2)
>>> df_right_joined = pd.merge(df1, df2, how='right', on='col1')
>>> df_right_joined
  col1  col2  col3
0    A   1.0   0.1
1    B   2.0   0.2
2    C   3.0   0.3
3    D   4.0   0.4
4    E   5.0   0.5
5    G   NaN   0.7

# outer join df1 and df2 (join based on union of keys of df1 and df2)
>>> df_outer_joined = pd.merge(df1, df2, how='outer', on='col1')
>>> df_outer_joined
  col1  col2  col3
0    A   1.0   0.1
1    B   2.0   0.2
2    C   3.0   0.3
3    D   4.0   0.4
4    E   5.0   0.5
5    F   6.0   NaN
6    I   8.0   NaN
7    G   NaN   0.7

# inner join df1 and df2 (join based on common keys of df1 and df2)
>>> df_inner_joined = pd.merge(df1, df2, how='inner', on='col1')
>>> df_inner_joined
  col1  col2  col3
0    A     1   0.1
1    B     2   0.2
2    C     3   0.3
3    D     4   0.4
4    E     5   0.5
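
Passing indicator=True adds a _merge column that shows which dataframe each row came from:

# outer join with the merge source indicated
>>> pd.merge(df1, df2, how='outer', on='col1', indicator=True)
  col1  col2  col3      _merge
0    A   1.0   0.1        both
1    B   2.0   0.2        both
2    C   3.0   0.3        both
3    D   4.0   0.4        both
4    E   5.0   0.5        both
5    F   6.0   NaN   left_only
6    I   8.0   NaN   left_only
7    G   NaN   0.7  right_only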

Correlation analysis with pandas dataframe

>>> import pandas as pd
>>> df = pd.read_csv("https://reneshbedre.github.io/assets/posts/anova/onewayanova.txt", sep="\t")
>>> df.head()
    A   B   C   D
0  25  45  30  54
1  30  55  29  60
2  28  29  33  51
3  36  56  37  62
4  29  40  27  73

# perform pearson correlation analysis
# check more methods at https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.corr.html
# returns correlation coefficient matrix
>>> df.corr(method='pearson')
          A         B         C         D
A  1.000000  0.569327  0.657544  0.341802
B  0.569327  1.000000  0.194615  0.264710
C  0.657544  0.194615  1.000000 -0.391609
D  0.341802  0.264710 -0.391609  1.000000
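
The method argument also accepts 'spearman' and 'kendall' for rank-based correlation:

# Spearman rank correlation
>>> df.corr(method='spearman')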

Compare all rows or columns

>>> import pandas as pd
>>> df = pd.DataFrame({'col1':['0', '0', '0' ], 'col2':[0,2,3], 'col3':[0,0.2,0.3]})
>>> df
  col1  col2  col3
0    0     0   0.0
1    0     2   0.2
2    0     3   0.3

# keep only rows where all values are nonzero
# (note: col1 holds the string '0', which is not equal to the number 0)
>>> df[(df!=0).all(axis=1)]
  col1  col2  col3
1    0     2   0.2
2    0     3   0.3

# get all rows where the row sum (numeric columns only) is > 3
>>> df[(df.sum(axis=1)>3)]
  col1  col2  col3
2    0     3   0.3
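
A related check is which columns contain at least one zero; again, the string '0' in col1 is not equal to the number 0:

# check which columns contain at least one zero
>>> (df == 0).any()
col1    False
col2     True
col3     True
dtype: bool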

Last updated: November 09, 2020