COVID-19 dataset

In [1]:
# Load the COVID-19 CSV into a Spark DataFrame: the first row supplies the
# column names and Spark infers each column's type from the data.
# NOTE(review): `spark` is not created in any visible cell -- presumably the
# kernel (e.g. a PySpark shell/kernel) provides the SparkSession; confirm.
covid_19_ds = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('./covid_19_dataset.csv')
)
In [2]:
# Preview the first 20 rows; long cell values are truncated in the printed table.
covid_19_ds.show(n=20, truncate=True)
+-----+---------+--------------+--------------+----------------+-----------+------------+---------+------+---------+------+--------------------+
| FIPS|   Admin2|Province_State|Country_Region|     Last_Update|        Lat|       Long_|Confirmed|Deaths|Recovered|Active|        Combined_Key|
+-----+---------+--------------+--------------+----------------+-----------+------------+---------+------+---------+------+--------------------+
|45001|Abbeville|South Carolina|            US|08/04/2020 22:51|34.22333378|-82.46170658|        5|     0|        0|     0|Abbeville, South ...|
|22001|   Acadia|     Louisiana|            US|08/04/2020 22:51| 30.2950649|-92.41419698|       86|     2|        0|     0|Acadia, Louisiana...|
|51001| Accomack|      Virginia|            US|08/04/2020 22:51|37.76707161|-75.63234615|       11|     0|        0|     0|Accomack, Virgini...|
|16001|      Ada|         Idaho|            US|08/04/2020 22:51| 43.4526575|-116.2415516|      438|     3|        0|     0|      Ada, Idaho, US|
|19001|    Adair|          Iowa|            US|08/04/2020 22:51|41.33075609|-94.47105874|        1|     0|        0|     0|     Adair, Iowa, US|
|21001|    Adair|      Kentucky|            US|08/04/2020 22:51|37.10459774|-85.28129668|        3|     0|        0|     0| Adair, Kentucky, US|
|29001|    Adair|      Missouri|            US|08/04/2020 22:51|40.19058551|-92.60078167|       11|     0|        0|     0| Adair, Missouri, US|
|40001|    Adair|      Oklahoma|            US|08/04/2020 22:51|35.88494195|-94.65859267|       25|     2|        0|     0| Adair, Oklahoma, US|
| 8001|    Adams|      Colorado|            US|08/04/2020 22:51|39.87432092|-104.3362578|      417|    14|        0|     0| Adams, Colorado, US|
|16003|    Adams|         Idaho|            US|08/04/2020 22:51|44.89333571|-116.4545247|        1|     0|        0|     0|    Adams, Idaho, US|
|17001|    Adams|      Illinois|            US|08/04/2020 22:51|39.98815591|-91.18786813|        1|     0|        0|     0| Adams, Illinois, US|
|18001|    Adams|       Indiana|            US|08/04/2020 22:51| 40.7457653|-84.93671406|        2|     0|        0|     0|  Adams, Indiana, US|
|28001|    Adams|   Mississippi|            US|08/04/2020 22:51|31.47669768|-91.35326037|       25|     0|        0|     0|Adams, Mississipp...|
|31001|    Adams|      Nebraska|            US|08/04/2020 22:51| 40.5244942|-98.50117804|       27|     0|        0|     0| Adams, Nebraska, US|
|39001|    Adams|          Ohio|            US|08/04/2020 22:51|38.84541072| -83.4718964|        2|     0|        0|     0|       Adams,Ohio,US|
|42001|    Adams|  Pennsylvania|            US|08/04/2020 22:51|39.87140411|-77.21610347|       33|     0|        0|     0|Adams, Pennsylvan...|
|53001|    Adams|    Washington|            US|08/04/2020 22:51|46.98299757|-118.5601734|       29|     0|        0|     0|Adams, Washington...|
|55001|    Adams|     Wisconsin|            US|08/04/2020 22:51|43.96974651|-89.76782777|        2|     0|        0|     0|Adams, Wisconsin, US|
|50001|  Addison|       Vermont|            US|08/04/2020 22:51|44.03217337|-73.14130877|       47|     0|        0|     0|Addison, Vermont, US|
|45003|    Aiken|South Carolina|            US|08/04/2020 22:51|33.54338026|-81.63645384|       28|     1|        0|     0|Aiken, South Caro...|
+-----+---------+--------------+--------------+----------------+-----------+------------+---------+------+---------+------+--------------------+
only showing top 20 rows

In [3]:
# List the DataFrame's column names in order.
covid_19_ds.columns
Out[3]:
['FIPS',
 'Admin2',
 'Province_State',
 'Country_Region',
 'Last_Update',
 'Lat',
 'Long_',
 'Confirmed',
 'Deaths',
 'Recovered',
 'Active',
 'Combined_Key']
In [4]:
# Bind a shorter name to the same DataFrame object (plain assignment, no copy).
ds = covid_19_ds
In [5]:
# Count the rows in the DataFrame.
ds.count()
Out[5]:
2883
In [6]:
# Fetch the first row as a Row object to inspect one full, untruncated record.
ds.head()
Out[6]:
Row(FIPS=45001, Admin2='Abbeville', Province_State='South Carolina', Country_Region='US', Last_Update='08/04/2020 22:51', Lat=34.22333378, Long_=-82.46170658, Confirmed=5, Deaths=0, Recovered=0, Active=0, Combined_Key='Abbeville, South Carolina, US')
In [7]:
# Print each column's name, inferred type, and nullability as a tree.
ds.printSchema()
root
 |-- FIPS: integer (nullable = true)
 |-- Admin2: string (nullable = true)
 |-- Province_State: string (nullable = true)
 |-- Country_Region: string (nullable = true)
 |-- Last_Update: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Long_: double (nullable = true)
 |-- Confirmed: integer (nullable = true)
 |-- Deaths: integer (nullable = true)
 |-- Recovered: integer (nullable = true)
 |-- Active: integer (nullable = true)
 |-- Combined_Key: string (nullable = true)

In [11]:
# Summary statistics (count, mean, stddev, min, max) for the Recovered and
# Deaths columns only. Passing the column names to describe() avoids computing
# statistics for every other column and then discarding them.
# describe() names its label column 'summary'; rename it with the exact
# lower-case name so this cell does not depend on case-insensitive column
# resolution, while keeping the 'Summary' header in the printed table.
(
    ds.describe('Recovered', 'Deaths')
    .withColumnRenamed('summary', 'Summary')
    .show()
)
+-------+------------------+------------------+
|Summary|         Recovered|            Deaths|
+-------+------------------+------------------+
|  count|              2883|              2883|
|   mean|113.99965313909122|30.640998959417274|
| stddev|1984.8740981822637|  515.026578561955|
|    min|                 0|                 0|
|    max|             64142|             17669|
+-------+------------------+------------------+

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: