#############
## IMPORTS ##
#############
# Install and import pymysql to connect to
# the database
!#pip install pymysql
import pymysql
# Pandas for dataframes
import pandas as pd
# Matplotlib for graphs
import matplotlib.pyplot as plt
#############################
## CONNECT TO THE DATABASE ##
#############################
= pymysql.connect(
connection = "dsc140.cmgtlybtlnmh.us-east-2.rds.amazonaws.com",
host = "guest",
user = "UMUdsc140",
password ="orders")
db
= connection.cursor() conn
################
## QUESTION 1 ##
################
# Identify and print the names of all the tables in the database.
# Look at the columns within some of the tables and briefly describe
# what this database appears to be all about.
# Gather the names of the tables in the database
= "SHOW TABLES"
tables = conn.execute(tables)
output = conn.fetchall()
tables = [tab[0] for tab in tables]
tables
print(tables)
# Iterate through each table in the database
for table in tables:
# Print the table name
print("Table:", table)
# Get the names of the columns for the table
= "SHOW COLUMNS FROM "+table
columns = conn.execute(columns)
output = conn.fetchall()
columns = [col[0] for col in columns]
columns # Print the names of the columns
print("Columns:", columns)
# Print a blank line
print()
# This data set seems to contain data for a company who sales products
# to customers. There is information about the employees, customers,
# products, and orders.
['categories', 'customers', 'employees', 'orders', 'ordersdetails', 'products', 'shippers', 'suppliers']
Table: categories
Columns: ['CategoryID', 'CategoryName', 'DescriptionText']
Table: customers
Columns: ['CustomerID', 'CustomerName', 'ContractName', 'Address', 'City', 'PostalCode', 'Country']
Table: employees
Columns: ['EmployeeID', 'LastName', 'FirstName', 'BirthDate', 'Photo', 'Notes']
Table: orders
Columns: ['OrderID', 'CustomerID', 'EmployeeID', 'OrderDate', 'ShipperID']
Table: ordersdetails
Columns: ['OrderDetailID', 'OrderID', 'ProductID', 'Quantity']
Table: products
Columns: ['ProductID', 'ProductName', 'SuppliersID', 'CategoryID', 'Unit', 'Price']
Table: shippers
Columns: ['ShipperID', 'ShipperName', 'Phone']
Table: suppliers
Columns: ['SupplierID', 'SuppliersName', 'ContactName', 'Address', 'City', 'PostalCode', 'Country', 'Phone']
################
## QUESTION 2 ##
################
# Find the sale amount for every order. To do this you’ll
# need to find how many units were sold in each order and
# the price per unit for the corresponding item that was
# sold, then multiply the values together.
# Use an inner join to get the correct data
# We need quantity from ordersdetails, price from products,
# and order id from odersdetails
= "SELECT o.Quantity, p.Price, o.OrderID FROM ordersdetails o INNER JOIN\
inner_join products p ON o.ProductID=p.ProductID"
= conn.execute(inner_join)
output = conn.fetchall()
data
# Convert the data to a datafra,e
= pd.DataFrame(data, columns=["Quantity", "Price", "Order ID"])
data # Compute the total price of the product ordered
"Total Price"] = data["Quantity"]*data["Price"]
data[# Use a groupby to find the total price for each of the orders
# Previously we had the total price for each product ordered by some
# orders have multiple products
= data.groupby(by="Order ID")["Total Price"].sum()
total_price_order_data total_price_order_data
Order ID
10248 566.00
10249 2329.25
10250 2267.25
10251 418.50
10252 4662.50
...
10320 645.00
10321 180.00
10322 140.00
10323 205.50
10324 4186.45
Name: Total Price, Length: 77, dtype: float64
################
## QUESTION 3 ##
################
# Make a histogram of the sale amounts you found in #2.
plt.hist(total_price_order_data)"Total Order Price")
plt.xlabel("Number of Orders") plt.ylabel(
Text(0, 0.5, 'Number of Orders')
################
## QUESTION 4 ##
###############
# Generate a pandas dataframe that lists the customerID for all
# customers served by the employee with EmployeeID “8”.
# Display the dataframe.
# Get the data using the where keyword to only pull the selected data
# You can also pull all of the data and then use a mask on the resulting
# dataframe
= "SELECT CustomerID FROM orders WHERE EmployeeID=8"
employee_8 = conn.execute(employee_8)
output = conn.fetchall()
customers
# Add the data to a dataframe
= pd.DataFrame(customers, columns=["Customer ID"])
customers customers
Customer ID | |
---|---|
0 | 65 |
1 | 33 |
2 | 80 |
3 | 5 |
4 | 44 |
5 | 63 |
6 | 67 |
7 | 15 |
8 | 86 |
9 | 55 |
10 | 77 |
11 | 38 |
12 | 84 |
13 | 58 |
14 | 29 |
15 | 75 |
16 | 37 |
17 | 4 |
18 | 83 |
################
## QUESTION 5 ##
###############
# Generate a pandas dataframe that lists the Customer’s Name and
# their corresponding country of residence, again only for customers
# served by the employee with EmployeeID “8”. Display the dataframe.
# Use an inner hoin to get the needed data while also still using the where
# statement to pull only selected data
= "SELECT c.CustomerName, c.Country FROM customers c INNER\
employee_8_expanded JOIN orders o ON o.CustomerID=c.CustomerID WHERE o.EmployeeID=8"
= conn.execute(employee_8_expanded)
output = conn.fetchall()
employee_8_expanded_data
# Create a dataframe
= pd.DataFrame(employee_8_expanded_data,
employee_8_expanded_data =["Customer Name", "Country"])
columns employee_8_expanded_data
Customer Name | Country | |
---|---|---|
0 | Rattlesnake Canyon Grocery | USA |
1 | GROSELLA-Restaurante | Venezuela |
2 | Tortuga Restaurante | Mexico |
3 | Berglunds snabbköp | Sweden |
4 | Lehmanns Marktstand | Germany |
5 | QUICK-Stop | Germany |
6 | Ricardo Adocicados | Brazil |
7 | Comércio Mineiro | Brazil |
8 | Die Wandernde Kuh | Germany |
9 | Old World Delicatessen | USA |
10 | The Big Cheese | USA |
11 | Island Trading | UK |
12 | Victuailles en stock | France |
13 | Pericles Comidas clásicas | Mexico |
14 | Galería del gastrónomo | Spain |
15 | Split Rail Beer & Ale | USA |
16 | Hungry Owl All-Night Grocers | Ireland |
17 | Around the Horn | UK |
18 | Vaffeljernet | Denmark |
"SELECT o.OrderID, d.EmployeeID FROM order o INNER JOIN ordersdetails d ON o.OrderID=d.OrderID WHERE "
import scipy.stats as s
s.pearsonr()