Unverified Commit 8e753a32 authored by Athokshay Ashok's avatar Athokshay Ashok Committed by GitHub
Browse files

Add files via upload

parent dfdaed24
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# IBM Capstone Project"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: BeautifulSoup4 in c:\\users\\athok\\miniconda3\\envs\\ml135_env\\lib\\site-packages (4.9.1)\n",
"Requirement already satisfied: soupsieve>1.2 in c:\\users\\athok\\miniconda3\\envs\\ml135_env\\lib\\site-packages (from BeautifulSoup4) (2.0.1)\n",
"Requirement already satisfied: requests in c:\\users\\athok\\miniconda3\\envs\\ml135_env\\lib\\site-packages (2.24.0)\n",
"Requirement already satisfied: idna<3,>=2.5 in c:\\users\\athok\\miniconda3\\envs\\ml135_env\\lib\\site-packages (from requests) (2.10)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\athok\\miniconda3\\envs\\ml135_env\\lib\\site-packages (from requests) (1.25.10)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in c:\\users\\athok\\miniconda3\\envs\\ml135_env\\lib\\site-packages (from requests) (3.0.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\athok\\miniconda3\\envs\\ml135_env\\lib\\site-packages (from requests) (2019.11.28)\n",
"Requirement already satisfied: lxml in c:\\users\\athok\\miniconda3\\envs\\ml135_env\\lib\\site-packages (4.5.2)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"ERROR: Could not find a version that satisfies the requirement xml (from versions: none)\n",
"ERROR: No matching distribution found for xml\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"!pip install BeautifulSoup4\n",
"!pip install requests\n",
"!pip install lxml"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hello Capstone Project Course!\n"
]
}
],
"source": [
"print('Hello Capstone Project Course!')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Week 3: Segmenting and Clustering the Neighborhoods in the City of Toronto, Canada"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Postal Code</th>\n",
" <th>Borough</th>\n",
" <th>Neighbourhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M1A</td>\n",
" <td>Not assigned</td>\n",
" <td>Not assigned</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M2A</td>\n",
" <td>Not assigned</td>\n",
" <td>Not assigned</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park, Harbourfront</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Postal Code Borough Neighbourhood\n",
"0 M1A Not assigned Not assigned\n",
"1 M2A Not assigned Not assigned\n",
"2 M3A North York Parkwoods\n",
"3 M4A North York Victoria Village\n",
"4 M5A Downtown Toronto Regent Park, Harbourfront"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from bs4 import BeautifulSoup\n",
"import requests\n",
"\n",
"#Use Beautiful Soup to extract page text\n",
"source = requests.get(\"https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M\")\n",
"soup = BeautifulSoup(source.text, 'html.parser')\n",
"\n",
"#Find table in HTML and extract all data into rows\n",
"data = []\n",
"columns = []\n",
"table = soup.find(class_='wikitable')\n",
"for index, tr in enumerate(table.find_all('tr')):\n",
" section = []\n",
" for td in tr.find_all(['th','td']):\n",
" section.append(td.text.rstrip())\n",
" \n",
" if (index == 0):\n",
" columns = section\n",
" else:\n",
" data.append(section)\n",
"\n",
"canada_df = pd.DataFrame(data = data,columns = columns)\n",
"canada_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Postal Code</th>\n",
" <th>Borough</th>\n",
" <th>Neighbourhood</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M3A</td>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M4A</td>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>M5A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park, Harbourfront</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>M6A</td>\n",
" <td>North York</td>\n",
" <td>Lawrence Manor, Lawrence Heights</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>M7A</td>\n",
" <td>Downtown Toronto</td>\n",
" <td>Queen's Park, Ontario Provincial Government</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Postal Code Borough Neighbourhood\n",
"2 M3A North York Parkwoods\n",
"3 M4A North York Victoria Village\n",
"4 M5A Downtown Toronto Regent Park, Harbourfront\n",
"5 M6A North York Lawrence Manor, Lawrence Heights\n",
"6 M7A Downtown Toronto Queen's Park, Ontario Provincial Government"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Remove all rows where borough is not assigned\n",
"canada_df = canada_df[canada_df['Borough'] != 'Not assigned']\n",
"canada_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, \n",
"# you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. \n",
"# These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in \n",
"#the above table.\n",
"\n",
"# This did not need to be addressed since the data was already grouped by postal code with all the corresponding neighborhoods."
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Borough</th>\n",
" <th>Neighbourhood</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Postal Code</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>M3A</th>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M4A</th>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M5A</th>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park, Harbourfront</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M6A</th>\n",
" <td>North York</td>\n",
" <td>Lawrence Manor, Lawrence Heights</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M7A</th>\n",
" <td>Downtown Toronto</td>\n",
" <td>Queen's Park, Ontario Provincial Government</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Borough Neighbourhood\n",
"Postal Code \n",
"M3A North York Parkwoods\n",
"M4A North York Victoria Village\n",
"M5A Downtown Toronto Regent Park, Harbourfront\n",
"M6A North York Lawrence Manor, Lawrence Heights\n",
"M7A Downtown Toronto Queen's Park, Ontario Provincial Government"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Update index to be postcode\n",
"if(canada_df.index.name != 'Postal Code'):\n",
" canada_df = canada_df.set_index('Postal Code')\n",
" \n",
"canada_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Borough</th>\n",
" <th>Neighbourhood</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Postal Code</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>M3A</th>\n",
" <td>North York</td>\n",
" <td>Parkwoods</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M4A</th>\n",
" <td>North York</td>\n",
" <td>Victoria Village</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M5A</th>\n",
" <td>Downtown Toronto</td>\n",
" <td>Regent Park, Harbourfront</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M6A</th>\n",
" <td>North York</td>\n",
" <td>Lawrence Manor, Lawrence Heights</td>\n",
" </tr>\n",
" <tr>\n",
" <th>M7A</th>\n",
" <td>Downtown Toronto</td>\n",
" <td>Queen's Park, Ontario Provincial Government</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Borough Neighbourhood\n",
"Postal Code \n",
"M3A North York Parkwoods\n",
"M4A North York Victoria Village\n",
"M5A Downtown Toronto Regent Park, Harbourfront\n",
"M6A North York Lawrence Manor, Lawrence Heights\n",
"M7A Downtown Toronto Queen's Park, Ontario Provincial Government"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.\n",
"canada_df['Neighbourhood'].replace(\"Not assigned\", canada_df[\"Borough\"],inplace=True)\n",
"canada_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(103, 2)"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"canada_df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
%% Cell type:markdown id: tags:
# IBM Capstone Project
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
!pip install BeautifulSoup4
!pip install requests
!pip install lxml
```
%%%% Output: stream
Requirement already satisfied: BeautifulSoup4 in c:\users\athok\miniconda3\envs\ml135_env\lib\site-packages (4.9.1)
Requirement already satisfied: soupsieve>1.2 in c:\users\athok\miniconda3\envs\ml135_env\lib\site-packages (from BeautifulSoup4) (2.0.1)
Requirement already satisfied: requests in c:\users\athok\miniconda3\envs\ml135_env\lib\site-packages (2.24.0)
Requirement already satisfied: idna<3,>=2.5 in c:\users\athok\miniconda3\envs\ml135_env\lib\site-packages (from requests) (2.10)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\users\athok\miniconda3\envs\ml135_env\lib\site-packages (from requests) (1.25.10)
Requirement already satisfied: chardet<4,>=3.0.2 in c:\users\athok\miniconda3\envs\ml135_env\lib\site-packages (from requests) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\athok\miniconda3\envs\ml135_env\lib\site-packages (from requests) (2019.11.28)
Requirement already satisfied: lxml in c:\users\athok\miniconda3\envs\ml135_env\lib\site-packages (4.5.2)
%%%% Output: stream
ERROR: Could not find a version that satisfies the requirement xml (from versions: none)
ERROR: No matching distribution found for xml
%% Cell type:code id: tags:
``` python
# Sanity-check cell: emit the course greeting on stdout.
print("Hello Capstone Project Course!")
```
%%%% Output: stream
Hello Capstone Project Course!
%% Cell type:markdown id: tags:
## Week 3: Segmenting and Clustering the Neighborhoods in the City of Toronto, Canada
%% Cell type:code id: tags:
``` python
from bs4 import BeautifulSoup
import requests
# Download the Wikipedia list of Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page.
source = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
source.raise_for_status()
soup = BeautifulSoup(source.text, 'html.parser')

# Locate the first element carrying the "wikitable" class; fail with a clear
# message (rather than an AttributeError on None) if the page layout changed.
table = soup.find(class_='wikitable')
if table is None:
    raise ValueError("No element with class 'wikitable' found on the page")

# Collect the right-stripped text of every <th>/<td> cell, one list per <tr>.
rows = [
    [cell.text.rstrip() for cell in tr.find_all(['th', 'td'])]
    for tr in table.find_all('tr')
]

# The first table row holds the column names; every later row is data.
columns = rows[0] if rows else []
data = rows[1:]

canada_df = pd.DataFrame(data=data, columns=columns)
canada_df.head()
```
%%%% Output: execute_result
Postal Code Borough Neighbourhood
0 M1A Not assigned Not assigned
1 M2A Not assigned Not assigned
2 M3A North York Parkwoods
3 M4A North York Victoria Village
4 M5A Downtown Toronto Regent Park, Harbourfront
%% Cell type:code id: tags:
``` python
# Keep only the postal codes whose borough is something other than 'Not assigned'.
canada_df = canada_df.loc[canada_df['Borough'].ne('Not assigned')]
canada_df.head()
```
%%%% Output: execute_result
Postal Code Borough Neighbourhood
2 M3A North York Parkwoods
3 M4A North York Victoria Village
4 M5A Downtown Toronto Regent Park, Harbourfront
5 M6A North York Lawrence Manor, Lawrence Heights
6 M7A Downtown Toronto Queen's Park, Ontario Provincial Government
%% Cell type:code id: tags:
``` python
# Requirement: more than one neighborhood can exist in one postal code area. For example, in the table on the
# Wikipedia page, M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park.
# Such rows are to be combined into one row, with the neighborhoods separated by a comma, as shown in row 11 of
# the above table.
# NOTE(review): "row 11" / "the above table" presumably refer to the assignment's sample table rather than to an
# output of this notebook - confirm.
#
# No combining step was needed here: the scraped data was already grouped by postal code, with all the
# corresponding neighborhoods in a single row (e.g. M5A -> "Regent Park, Harbourfront" in the output above).
```
%% Cell type:code id: tags:
``` python
# Make the postal code the row index. The check on the current index name
# leaves the frame untouched when 'Postal Code' is already the index, so the
# cell can be run more than once.
canada_df = (
    canada_df
    if canada_df.index.name == 'Postal Code'
    else canada_df.set_index('Postal Code')
)

canada_df.head()
```
%%%% Output: execute_result
Borough Neighbourhood
Postal Code
M3A North York Parkwoods
M4A North York Victoria Village
M5A Downtown Toronto Regent Park, Harbourfront
M6A North York Lawrence Manor, Lawrence Heights
M7A Downtown Toronto Queen's Park, Ontario Provincial Government
%% Cell type:code id: tags:
``` python
# If a row has a borough but a 'Not assigned' neighbourhood, the neighbourhood
# becomes the same as the borough.
#
# Series.mask(cond, other) takes the value from `other` on exactly the rows
# where `cond` is True, so each affected row receives its own borough.
# The previous form, Series.replace("Not assigned", <Series>, inplace=True) on a
# selected column, is not a row-aligned replacement, and an in-place call on a
# selected column is not guaranteed to write back to canada_df.
# assign() returns a new frame, which keeps this cell safe to re-run.
canada_df = canada_df.assign(
    Neighbourhood=canada_df['Neighbourhood'].mask(
        canada_df['Neighbourhood'] == 'Not assigned',
        canada_df['Borough'],
    )
)
canada_df.head()
```
%%%% Output: execute_result
Borough Neighbourhood
Postal Code
M3A North York Parkwoods
M4A North York Victoria Village
M5A Downtown Toronto Regent Park, Harbourfront
M6A North York Lawrence Manor, Lawrence Heights
M7A Downtown Toronto Queen's Park, Ontario Provincial Government
%% Cell type:code id: tags:
``` python
# Show the (rows, columns) of the cleaned frame; the recorded run reports (103, 2).
canada_df.shape
```
%%%% Output: execute_result
(103, 2)
%% Cell type:code id: tags:
``` python
```
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment