Read a CSV file from S3

A simple, easy-to-follow explanation of how to read a CSV file stored in S3 from a Jupyter notebook: https://www.youtube.com/watch?v=2hfCRrmFcH0

In [1]:
import boto3
In [2]:
# Create a boto3 client for the S3 service.
# NOTE(review): `client` is not referenced by any later cell shown here — the CSV
# is loaded by passing the s3:// path straight to pd.read_csv. Confirm whether
# this cell is still needed.
client = boto3.client('s3')
In [3]:
# S3 URI of the COVID-19 CSV file; this is the value handed to pd.read_csv
# when the DataFrame is loaded.
path = 's3://richardfrancis.info/CN7030/covid_19_dataset.csv'
In [6]:
pip install s3fs
Collecting s3fs
  Downloading https://files.pythonhosted.org/packages/b8/e4/b8fc59248399d2482b39340ec9be4bb2493846ac23641b43115a7e5cd675/s3fs-0.4.2-py3-none-any.whl
Requirement already satisfied: botocore>=1.12.91 in c:\users\user\anaconda3\lib\site-packages (from s3fs) (1.15.39)
Collecting fsspec>=0.6.0 (from s3fs)
  Downloading https://files.pythonhosted.org/packages/24/1e/69c46eb946e8dd9dc62fc0b62e200cefd08632924605130c9f8afa3b2991/fsspec-0.7.2-py3-none-any.whl (67kB)
Requirement already satisfied: docutils<0.16,>=0.10 in c:\users\user\anaconda3\lib\site-packages (from botocore>=1.12.91->s3fs) (0.15.2)
Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in c:\users\user\anaconda3\lib\site-packages (from botocore>=1.12.91->s3fs) (0.9.5)
Requirement already satisfied: urllib3<1.26,>=1.20; python_version != "3.4" in c:\users\user\anaconda3\lib\site-packages (from botocore>=1.12.91->s3fs) (1.24.2)
Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in c:\users\user\anaconda3\lib\site-packages (from botocore>=1.12.91->s3fs) (2.8.0)
Requirement already satisfied: six>=1.5 in c:\users\user\anaconda3\lib\site-packages (from python-dateutil<3.0.0,>=2.1->botocore>=1.12.91->s3fs) (1.12.0)
Installing collected packages: fsspec, s3fs
  Found existing installation: fsspec 0.5.2
    Uninstalling fsspec-0.5.2:
      Successfully uninstalled fsspec-0.5.2
Successfully installed fsspec-0.7.2 s3fs-0.4.2
Note: you may need to restart the kernel to use updated packages.
In [7]:
import pandas as pd
In [8]:
# Load the CSV at `path` into a DataFrame.
# NOTE(review): `path` is an s3:// URI; presumably pandas resolves it through
# the s3fs package installed earlier in this notebook — confirm against the
# pandas I/O documentation.
df = pd.read_csv(path)
In [9]:
# Preview the first five rows to sanity-check the columns and values.
df.head(n=5)
Out[9]:
FIPS Admin2 Province_State Country_Region Last_Update Lat Long_ Confirmed Deaths Recovered Active Combined_Key
0 45001.0 Abbeville South Carolina US 08/04/2020 22:51 34.223334 -82.461707 5 0 0 0 Abbeville, South Carolina, US
1 22001.0 Acadia Louisiana US 08/04/2020 22:51 30.295065 -92.414197 86 2 0 0 Acadia, Louisiana, US
2 51001.0 Accomack Virginia US 08/04/2020 22:51 37.767072 -75.632346 11 0 0 0 Accomack, Virginia, US
3 16001.0 Ada Idaho US 08/04/2020 22:51 43.452658 -116.241552 438 3 0 0 Ada, Idaho, US
4 19001.0 Adair Iowa US 08/04/2020 22:51 41.330756 -94.471059 1 0 0 0 Adair, Iowa, US
In [16]:
# Number of rows in the loaded frame.
df.shape[0]
Out[16]:
2883
In [17]:
# Dimensions of the loaded frame as (rows, columns).
df.shape
Out[17]:
(2883, 12)
In [13]:
# Print per-column non-null counts and dtypes; columns whose non-null count is
# below the total row count contain missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2883 entries, 0 to 2882
Data columns (total 12 columns):
FIPS              2603 non-null float64
Admin2            2614 non-null object
Province_State    2703 non-null object
Country_Region    2883 non-null object
Last_Update       2883 non-null object
Lat               2823 non-null float64
Long_             2823 non-null float64
Confirmed         2883 non-null int64
Deaths            2883 non-null int64
Recovered         2883 non-null int64
Active            2883 non-null int64
Combined_Key      2883 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 270.4+ KB
In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
Out[15]:
FIPS Lat Long_ Confirmed Deaths Recovered Active
count 2603.000000 2823.000000 2823.000000 2883.000000 2883.000000 2883.000000 2883.000000
mean 31174.546293 36.509878 -80.231851 524.142907 30.640999 113.999653 238.892473
std 17320.418275 10.087278 41.051477 5638.119517 515.026579 1984.874098 3423.157161
min 66.000000 -51.796300 -159.856183 0.000000 0.000000 0.000000 0.000000
25% 18072.000000 33.710967 -95.279100 3.000000 0.000000 0.000000 0.000000
50% 29125.000000 37.793446 -86.983101 11.000000 0.000000 0.000000 0.000000
75% 46053.000000 41.435951 -80.651280 51.000000 1.000000 0.000000 0.000000
max 99999.000000 71.706900 178.065000 148220.000000 17669.000000 64142.000000 95262.000000
In [18]:
# Count how many rows each Province_State value has, most frequent first.
df.loc[:, "Province_State"].value_counts()
Out[18]:
Texas             168
Georgia           156
Virginia          134
Tennessee          96
North Carolina     93
                 ... 
Curacao             1
Hebei               1
Ontario             1
Mayotte             1
Fujian              1
Name: Province_State, Length: 137, dtype: int64
In [ ]:
# Select the Province_State column. With a single key, .loc looks up a *row*
# label, and this frame's index is a RangeIndex of integers, so
# df.loc["Province_State"] raises KeyError. The column has to be addressed on
# the second axis instead.
df.loc[:, "Province_State"]