Julie Butler – mysql_homeworksolutions

##############################
##############################
import pymysql
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

##############################
##        QUESTION 1        ##
##############################
# Connect to the world database
connection = pymysql.connect(
host = "dsc140.cmgtlybtlnmh.us-east-2.rds.amazonaws.com",
user = "guest",
password = "UMUdsc140",
db="world")

conn = connection.cursor()

##############################
##        QUESTION 2        ##
##############################
# Show all of the table names and format the returned data so it can be
# used later
show_tables = "SHOW TABLES"
output = conn.execute(show_tables)
table_names = conn.fetchall()
table_names = [i[0] for i in table_names]
print("Table Names")
print(table_names)

# For every table, print the names of the columns of data in the table
for table_name in table_names:
    get_cols = "SHOW COLUMNS FROM "+table_name
    output = conn.execute(get_cols)
    col_names = conn.fetchall()
    col_names = [i[0] for i in col_names]
    print(table_name)
    print(col_names)
    print()

Table Names
['City', 'Country', 'CountryLanguage']
City
['ID', 'Name', 'CountryCode', 'District', 'Population']

Country
['Code', 'Name', 'Continent', 'Region', 'SurfaceArea', 'IndepYear', 'Population', 'LifeExpectancy', 'GNP', 'GNPOld', 'LocalName', 'GovernmentForm', 'HeadOfState', 'Capital', 'Code2']

CountryLanguage
['CountryCode', 'Language', 'IsOfficial', 'Percentage']

##############################
##        QUESTION 3        ##
##############################
# Get the column names from the CountryLanguage Table
get_cols = "SHOW COLUMNS FROM CountryLanguage"
output = conn.execute(get_cols)
col_names = conn.fetchall()
col_names = [i[0] for i in col_names]

# Get the data from the CountryLanguage table
languages = "SELECT * FROM CountryLanguage"
output = conn.execute(languages)
data = conn.fetchall()

# Combine everything into a single dataframe
country_language = pd.DataFrame(data, columns=col_names)
country_language

	CountryCode	Language	IsOfficial	Percentage
0	ABW	Dutch	T	5.3
1	ABW	English	F	9.5
2	ABW	Papiamento	F	76.7
3	ABW	Spanish	F	7.4
4	AFG	Balochi	F	0.9
...	...	...	...	...
979	ZMB	Tongan	F	11.0
980	ZWE	English	T	2.2
981	ZWE	Ndebele	F	16.2
982	ZWE	Nyanja	F	2.2
983	ZWE	Shona	F	72.1

984 rows × 4 columns

##############################
##        QUESTION 4        ##
##############################
# Use an inner join to relate the name and region of the country from the Country table
# to the country language from the CountryLanguage table. Only keep results if the language
# is spanish
spanish_only = "SELECT c.Name, c.Region FROM Country c INNER JOIN CountryLanguage l \
    ON c.Code=l.CountryCode WHERE l.Language LIKE '%Spanish%'"
output = conn.execute(spanish_only)
data = conn.fetchall()
# Put the data in a dataframe, print the length of the dataframe and then the dataframe
spanish_countries = pd.DataFrame(data, columns=["Name", "Region"])
print("Number of Spanish Speaking Countries:", len(spanish_only))
spanish_countries

Number of Spanish Speaking Countries: 129

	Name	Region
0	Aruba	Caribbean
1	Andorra	Southern Europe
2	Argentina	South America
3	Belize	Central America
4	Bolivia	South America
5	Canada	North America
6	Chile	South America
7	Colombia	South America
8	Costa Rica	Central America
9	Cuba	Caribbean
10	Dominican Republic	Caribbean
11	Ecuador	South America
12	Spain	Southern Europe
13	France	Western Europe
14	Guatemala	Central America
15	Honduras	Central America
16	Mexico	Central America
17	Nicaragua	Central America
18	Panama	Central America
19	Peru	South America
20	Puerto Rico	Caribbean
21	Paraguay	South America
22	El Salvador	Central America
23	Sweden	Nordic Countries
24	Uruguay	South America
25	United States	North America
26	Venezuela	South America
27	Virgin Islands, U.S.	Caribbean

##############################
##        QUESTION 5        ##
##############################
# Get the column names from the Country table
get_cols = "SHOW COLUMNS FROM Country"
output = conn.execute(get_cols)
col_names = conn.fetchall()
col_names = [i[0] for i in col_names]

# Get the data from the Country table
languages = "SELECT * FROM Country"
output = conn.execute(languages)
data = conn.fetchall()

# Convert the retrieved data into a DataFrame
country = pd.DataFrame(data, columns=col_names)

# Drop null values
country = country.dropna()

# Extract all of the regions
regions = list(set(country['Region']))

# Create empty lists to store the average life expectancy and life
# expectancies for every country in a region
avg_life_expect = []
life_expect = []
# Iterate through every region
for region in regions:
    # Create a sub-dataframe with only countries in a given region
    region_data = country[country["Region"]==region]
    # Temporary array to store the life expectancies of each country in a 
    # region
    life_expect_temp = []
    # Iterate through each country in a region
    for i in range(len(region_data)):
        # Save the life expectancy
        life_expect_temp.append(region_data["LifeExpectancy"].iloc[i])
    # Store the list of life expectancies and the average life expectancy for
    # each region
    life_expect.append(life_expect_temp)
    avg_life_expect.append(np.mean(life_expect_temp))

# Sorted the saved data by average life expectancy    
life_expect_sorted = sorted(zip(avg_life_expect,regions,life_expect))

# Extract, in order of increasing average life expectancy, the list of 
# life expectancies and the region
life_expect_sorted_data = []
sorted_regions = []
for row in life_expect_sorted:
    avg, region, data = row
    sorted_regions.append(region)
    life_expect_sorted_data.append(data)
 
# Create a boxplot of the life expectancy data   
plt.boxplot(life_expect_sorted_data)
# Change the x-ticks, add x and y labels
plt.xticks(range(len(sorted_regions)), sorted_regions, rotation=90)
plt.xlabel('Region', fontsize=14)
plt.ylabel("Life Expectancy", fontsize=14)

Text(0, 0.5, 'Life Expectancy')