##############################
##############################
import pymysql
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
##############################
## QUESTION 1 ##
##############################
# Connect to the world database
= pymysql.connect(
connection = "dsc140.cmgtlybtlnmh.us-east-2.rds.amazonaws.com",
host = "guest",
user = "UMUdsc140",
password ="world")
db
= connection.cursor() conn
##############################
## QUESTION 2 ##
##############################
# Show all of the table names and format the returned data so it can be
# used later
= "SHOW TABLES"
show_tables = conn.execute(show_tables)
output = conn.fetchall()
table_names = [i[0] for i in table_names]
table_names print("Table Names")
print(table_names)
# For every table, print the names of the columns of data in the table
for table_name in table_names:
= "SHOW COLUMNS FROM "+table_name
get_cols = conn.execute(get_cols)
output = conn.fetchall()
col_names = [i[0] for i in col_names]
col_names print(table_name)
print(col_names)
print()
Table Names
['City', 'Country', 'CountryLanguage']
City
['ID', 'Name', 'CountryCode', 'District', 'Population']
Country
['Code', 'Name', 'Continent', 'Region', 'SurfaceArea', 'IndepYear', 'Population', 'LifeExpectancy', 'GNP', 'GNPOld', 'LocalName', 'GovernmentForm', 'HeadOfState', 'Capital', 'Code2']
CountryLanguage
['CountryCode', 'Language', 'IsOfficial', 'Percentage']
##############################
## QUESTION 3 ##
##############################
# Get the column names from the CountryLanguage Table
= "SHOW COLUMNS FROM CountryLanguage"
get_cols = conn.execute(get_cols)
output = conn.fetchall()
col_names = [i[0] for i in col_names]
col_names
# Get the data from the CountryLanguage table
= "SELECT * FROM CountryLanguage"
languages = conn.execute(languages)
output = conn.fetchall()
data
# Combine everything into a single dataframe
= pd.DataFrame(data, columns=col_names)
country_language country_language
CountryCode | Language | IsOfficial | Percentage | |
---|---|---|---|---|
0 | ABW | Dutch | T | 5.3 |
1 | ABW | English | F | 9.5 |
2 | ABW | Papiamento | F | 76.7 |
3 | ABW | Spanish | F | 7.4 |
4 | AFG | Balochi | F | 0.9 |
... | ... | ... | ... | ... |
979 | ZMB | Tongan | F | 11.0 |
980 | ZWE | English | T | 2.2 |
981 | ZWE | Ndebele | F | 16.2 |
982 | ZWE | Nyanja | F | 2.2 |
983 | ZWE | Shona | F | 72.1 |
984 rows × 4 columns
##############################
## QUESTION 4 ##
##############################
# Use an inner join to relate the name and region of the country from the Country table
# to the country language from the CountryLanguage table. Only keep results if the language
# is spanish
= "SELECT c.Name, c.Region FROM Country c INNER JOIN CountryLanguage l \
spanish_only ON c.Code=l.CountryCode WHERE l.Language LIKE '%Spanish%'"
= conn.execute(spanish_only)
output = conn.fetchall()
data # Put the data in a dataframe, print the length of the dataframe and then the dataframe
= pd.DataFrame(data, columns=["Name", "Region"])
spanish_countries print("Number of Spanish Speaking Countries:", len(spanish_only))
spanish_countries
Number of Spanish Speaking Countries: 129
Name | Region | |
---|---|---|
0 | Aruba | Caribbean |
1 | Andorra | Southern Europe |
2 | Argentina | South America |
3 | Belize | Central America |
4 | Bolivia | South America |
5 | Canada | North America |
6 | Chile | South America |
7 | Colombia | South America |
8 | Costa Rica | Central America |
9 | Cuba | Caribbean |
10 | Dominican Republic | Caribbean |
11 | Ecuador | South America |
12 | Spain | Southern Europe |
13 | France | Western Europe |
14 | Guatemala | Central America |
15 | Honduras | Central America |
16 | Mexico | Central America |
17 | Nicaragua | Central America |
18 | Panama | Central America |
19 | Peru | South America |
20 | Puerto Rico | Caribbean |
21 | Paraguay | South America |
22 | El Salvador | Central America |
23 | Sweden | Nordic Countries |
24 | Uruguay | South America |
25 | United States | North America |
26 | Venezuela | South America |
27 | Virgin Islands, U.S. | Caribbean |
##############################
## QUESTION 5 ##
##############################
# Get the column names from the Country table
= "SHOW COLUMNS FROM Country"
get_cols = conn.execute(get_cols)
output = conn.fetchall()
col_names = [i[0] for i in col_names]
col_names
# Get the data from the Country table
= "SELECT * FROM Country"
languages = conn.execute(languages)
output = conn.fetchall()
data
# Convert the retrieved data into a DataFrame
= pd.DataFrame(data, columns=col_names)
country
# Drop null values
= country.dropna()
country
# Extract all of the regions
= list(set(country['Region']))
regions
# Create empty lists to store the average life expectancy and life
# expectancies for every country in a region
= []
avg_life_expect = []
life_expect # Iterate through every region
for region in regions:
# Create a sub-dataframe with only countries in a given region
= country[country["Region"]==region]
region_data # Temporary array to store the life expectancies of each country in a
# region
= []
life_expect_temp # Iterate through each country in a region
for i in range(len(region_data)):
# Save the life expectancy
"LifeExpectancy"].iloc[i])
life_expect_temp.append(region_data[# Store the list of life expectancies and the average life expectancy for
# each region
life_expect.append(life_expect_temp)
avg_life_expect.append(np.mean(life_expect_temp))
# Sorted the saved data by average life expectancy
= sorted(zip(avg_life_expect,regions,life_expect))
life_expect_sorted
# Extract, in order of increasing average life expectancy, the list of
# life expectancies and the region
= []
life_expect_sorted_data = []
sorted_regions for row in life_expect_sorted:
= row
avg, region, data
sorted_regions.append(region)
life_expect_sorted_data.append(data)
# Create a boxplot of the life expectancy data
plt.boxplot(life_expect_sorted_data)# Change the x-ticks, add x and y labels
range(len(sorted_regions)), sorted_regions, rotation=90)
plt.xticks('Region', fontsize=14)
plt.xlabel("Life Expectancy", fontsize=14) plt.ylabel(
Text(0, 0.5, 'Life Expectancy')