Basic Data Analysis and Visualization using Pandas and Matplotlib

Author: Iqbal


Sales Data (100000)
In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Load Data

In [33]:
col_name =["Region", "Country", "Item Type", "Sales Channel", "Order Priority", 
       "Order Date", "Order ID", "Ship Date", "Unit Sold", "Unit Price",
      "Unit Cost", "Total Revenue", "Total Cost", "Total Profit"]

dtype = {"Unit Sold": np.int8, "Unit Price": np.float, "Unit Cost": np.float, "Total Revenue": np.float, "Total Cost": np.float, "Total Profit": np.float}

df = pd.read_csv('data/sales.csv', dtype=dtype)
df
Out[33]:
Region Country Item Type Sales Channel Order Priority Order Date Order ID Ship Date Units Sold Unit Price Unit Cost Total Revenue Total Cost Total Profit
0 Middle East and North Africa Azerbaijan Snacks Online C 10/8/2014 535113847 10/23/2014 934 152.58 97.44 142509.72 91008.96 51500.76
1 Central America and the Caribbean Panama Cosmetics Offline L 2/22/2015 874708545 2/27/2015 4551 437.20 263.33 1989697.20 1198414.83 791282.37
2 Sub-Saharan Africa Sao Tome and Principe Fruits Offline M 12/9/2015 854349935 1/18/2016 9986 9.33 6.92 93169.38 69103.12 24066.26
3 Sub-Saharan Africa Sao Tome and Principe Personal Care Online M 9/17/2014 892836844 10/12/2014 9118 81.73 56.67 745214.14 516717.06 228497.08
4 Central America and the Caribbean Belize Household Offline H 2/4/2010 129280602 3/5/2010 5858 668.27 502.54 3914725.66 2943879.32 970846.34
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
99995 Sub-Saharan Africa Niger Cereal Offline L 8/26/2012 836322486 9/11/2012 5263 205.70 117.11 1082599.10 616349.93 466249.17
99996 Europe Poland Meat Offline C 12/3/2013 110449349 12/10/2013 3272 421.89 364.69 1380424.08 1193265.68 187158.40
99997 Sub-Saharan Africa Comoros Clothes Online M 8/7/2013 193128764 8/31/2013 9948 109.28 35.84 1087117.44 356536.32 730581.12
99998 Middle East and North Africa Kuwait Cosmetics Online L 6/28/2011 701597058 7/3/2011 7015 437.20 263.33 3066958.00 1847259.95 1219698.05
99999 Sub-Saharan Africa Tanzania Cosmetics Offline C 4/3/2012 423403060 4/30/2012 3229 437.20 263.33 1411718.80 850292.57 561426.23

100000 rows × 14 columns

In [5]:
df.describe()
Out[5]:
Order ID Unit Sold Unit Price Unit Cost Total Revenue Total Cost Total Profit
count 3.357400e+04 33574.000000 33574.000000 33574.000000 3.357400e+04 3.357400e+04 3.357300e+04
mean 5.508671e+08 5011.257908 267.000522 188.060721 1.339089e+06 9.429690e+05 3.961163e+05
std 2.587609e+08 2886.473979 217.135148 175.770843 1.472910e+06 1.151906e+06 3.807359e+05
min 1.000896e+08 1.000000 9.330000 6.920000 1.866000e+01 1.384000e+01 4.820000e+00
25% 3.283191e+08 2511.000000 109.280000 56.670000 2.828506e+05 1.636720e+05 9.654390e+04
50% 5.486556e+08 5031.000000 205.700000 117.110000 7.920981e+05 4.669959e+05 2.854945e+05
75% 7.732641e+08 7491.000000 437.200000 263.330000 1.849440e+06 1.218626e+06 5.717862e+05
max 9.999965e+08 10000.000000 668.270000 524.960000 6.680695e+06 5.249075e+06 1.738700e+06

Check Outlier

In [6]:
df["Unit Sold"].plot(kind="box", vert=False, figsize=(14,6))
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f45e924d460>

Check Data Density and Median

In [7]:
df["Unit Sold"].plot(kind="density", figsize=(14,6))
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f45e71decd0>
In [8]:
ax = df["Unit Sold"].plot(kind="density", figsize=(14,6))
# ax.axvline(df["Unit Sold"].mean(), color="black")
ax.axvline(df["Unit Sold"].median(), color="green")
Out[8]:
<matplotlib.lines.Line2D at 0x7f45dd541a90>

Count Item Type

In [9]:
df["Item Type"].value_counts()
Out[9]:
Cosmetics          2856
Clothes            2841
Snacks             2819
Household          2812
Office Supplies    2811
Cereal             2809
Baby Food          2798
Fruits             2778
Vegetables         2768
Beverages          2767
Meat               2760
Personal Care      2755
Name: Item Type, dtype: int64

Find Coorelation among classes

In [10]:
corr = df.corr()
corr
Out[10]:
Order ID Unit Sold Unit Price Unit Cost Total Revenue Total Cost Total Profit
Order ID 1.000000 0.003635 -0.001731 -0.001185 0.001908 0.002415 0.000088
Unit Sold 0.003635 1.000000 0.001724 0.001081 0.523309 0.470639 0.600575
Unit Price -0.001731 0.001724 1.000000 0.986008 0.739844 0.755079 0.577703
Unit Cost -0.001185 0.001081 0.986008 1.000000 0.729265 0.765580 0.505001
Total Revenue 0.001908 0.523309 0.739844 0.729265 1.000000 0.987649 0.880500
Total Cost 0.002415 0.470639 0.755079 0.765580 0.987649 1.000000 0.795349
Total Profit 0.000088 0.600575 0.577703 0.505001 0.880500 0.795349 1.000000
In [11]:
fig = plt.figure(figsize=(8,8))
plt.matshow(corr, cmap="RdYlGn", fignum=fig.number)
plt.xticks(range(len(corr.columns)), corr.columns, rotation='vertical')
plt.yticks(range(len(corr.columns)), corr.columns)
Out[11]:
([<matplotlib.axis.YTick at 0x7f45dd4b6c40>,
  <matplotlib.axis.YTick at 0x7f45dd4b6820>,
  <matplotlib.axis.YTick at 0x7f45dd543490>,
  <matplotlib.axis.YTick at 0x7f45dd4e4ac0>,
  <matplotlib.axis.YTick at 0x7f45dd4e80a0>,
  <matplotlib.axis.YTick at 0x7f45dd4e8640>,
  <matplotlib.axis.YTick at 0x7f45dd4e8be0>],
 <a list of 7 Text yticklabel objects>)
In [12]:
df.plot(kind="scatter", x="Unit Sold", y="Total Revenue", figsize=(6,6))
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f45dd5208e0>
In [14]:
df.loc[df["Region"]== 'Europe']
Out[14]:
Region Country Item Type Sales Channel Order Priority Order Date Order ID Ship Date Unit Sold Unit Price Unit Cost Total Revenue Total Cost Total Profit
2 Europe Slovenia Cosmetics Online M 11/26/2014 200732934 12/15/2014 191 437.20 263.33 83505.20 50296.03 33209.17
3 Europe Poland Office Supplies Offline M 6/27/2017 259352290 7/18/2017 6961 651.21 524.96 4533072.81 3654246.56 878826.25
6 Europe Kosovo Cosmetics Offline H 5/8/2013 706545197 6/20/2013 1280 437.20 263.33 559616.00 337062.40 222553.60
13 Europe Spain Fruits Online C 6/13/2011 155975234 6/27/2011 5393 9.33 6.92 50316.69 37319.56 12997.13
19 Europe United Kingdom Cosmetics Offline L 11/18/2016 959627711 12/11/2016 4020 437.20 263.33 1757544.00 1058586.60 698957.40
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
33552 Europe Romania Vegetables Online L 5/23/2012 901529347 6/27/2012 1423 154.06 90.93 219227.38 129393.39 89833.99
33558 Europe Estonia Baby Food Offline L 2/28/2017 396188294 3/20/2017 9775 255.28 159.42 2495362.00 1558330.50 937031.50
33563 Europe Lithuania Fruits Online M 1/11/2016 494030559 2/10/2016 3476 9.33 6.92 32431.08 24053.92 8377.16
33567 Europe Norway Cosmetics Online M 2/2/2015 388398730 2/22/2015 4670 437.20 263.33 2041724.00 1229751.10 811972.90
33569 Europe Albania Office Supplies Offline C 12/24/2014 290511405 1/2/2015 7635 651.21 524.96 4971988.35 4008069.60 963918.75

8623 rows × 14 columns

Group wise data

In [24]:
df.loc[df["Region"]== 'Europe', 'Total Profit'].sum()
Out[24]:
3378667382.08
In [33]:
df.groupby(["Region", "Item Type"])[["Unit Sold", "Total Revenue", "Total Profit"]].sum().astype(int)
Out[33]:
Unit Sold Total Revenue Total Profit
Region Item Type
Asia Baby Food 2061860 526351620 197649899
Beverages 2047342 97146377 32061375
Cereal 1998923 411178461 177084588
Clothes 2138045 233645557 157018024
Cosmetics 1902179 831632658 330731862
... ... ... ... ...
Sub-Saharan Africa Office Supplies 3836754 2498532572 484390192
Personal Care 3509859 286860776 87957066
Snacks 3735799 570008211 205991956
Vegetables 3487264 537247891 220150976
stralia and Oceania Vegetables 7766 1196429 490267

85 rows × 3 columns