# Dependencies
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import re
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
# Create input variables
search = input("What would you like to search for: ")
zipcode = input("Which ZIP code would you like to search in: ")
# pieces of the Yelp search URL to concatenate
default = "https://www.yelp.com/search?find_desc="
locationUrl = "&find_loc="
end = "&ns=1"
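# For example, with hypothetical inputs search = "haircut" and zipcode = "92618",
# the pieces above assemble into the query URL:
#   https://www.yelp.com/search?find_desc=haircut&find_loc=92618&ns=1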
# empty lists to append to
url = []
business_name = []
business_category = []
yelp_rating = []
review_count = []
price_range = []
price_category = []
address = []
phone = []
website = []
# function to run the search on Yelp and collect each business page URL
def search_yelp(search, zipcode):
    query_url = f"{default}{search}{locationUrl}{zipcode}{end}"
    response = requests.get(query_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    for link in soup.find_all('a', class_="biz-name"):
        url.append("https://www.yelp.com" + link.get('href'))
    # drop the first matched link (typically a sponsored result rather than an organic listing)
    url.pop(0)
# function to scrape business details from each Yelp business page
def scrape_yelp(url):
    for i in url:
        response = requests.get(i)
        soup = BeautifulSoup(response.text, 'html.parser')
        business_name.append(soup.find('h1', attrs={'class': 'biz-page-title'}).text.strip())
        business_category.append(soup.find(attrs={'class': 'category-str-list'}).text.strip())
        yelp_rating.append(soup.find(attrs={'class': 'i-stars'})['title'])
        review_count.append(soup.find(attrs={'class': 'review-count rating-qualifier'}).text.strip())
        # price range and price category are not listed for every business
        if soup.find(attrs={'class': 'business-attribute price-range'}) is not None:
            price_range.append(soup.find(attrs={'class': 'business-attribute price-range'}).text.strip())
        else:
            price_range.append('unknown')
        if soup.find(attrs={'class': 'nowrap price-description'}) is not None:
            price_category.append(soup.find(attrs={'class': 'nowrap price-description'}).text.strip())
        else:
            price_category.append('unknown')
        address.append(soup.find(attrs={'class': 'street-address'}).text.strip())
        phone.append(soup.find(attrs={'class': 'biz-phone'}).text.strip())
        # the business website, when listed, sits behind a 'biz_redir' link
        if soup.find('a', href=lambda href: href and 'biz_redir?' in href) is not None:
            website.append(soup.find('a', href=lambda href: href and 'biz_redir?' in href).text.strip())
        else:
            website.append('no website')
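# Note: Yelp may throttle or block rapid repeated requests. If pages start coming back
# empty, a short pause between fetches is a common safeguard. A minimal sketch, assuming
# a fixed delay is acceptable (polite_get is a hypothetical helper, not used above):
import time

def polite_get(page_url, delay=2.0):
    """Fetch a page after a short pause to avoid hammering the server."""
    time.sleep(delay)
    return requests.get(page_url, headers={'User-Agent': 'Mozilla/5.0'})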
# create a DataFrame from the scraped Yelp data and write it to CSV
def create_table():
    data = {
        'BusinessName': business_name,
        'BusinessCategory': business_category,
        'YelpRating': yelp_rating,
        'ReviewCount': review_count,
        'PriceRange($)': price_range,
        'PriceCategory': price_category,
        'Address': address,
        'Phone': phone,
        'Website': website
    }
    df = pd.DataFrame.from_dict(data)
    df = df[['BusinessName', 'BusinessCategory', 'YelpRating', 'ReviewCount', 'PriceRange($)',
             'PriceCategory', 'Address', 'Phone', 'Website']]
    df.to_csv(f'data/yelp_{search}{zipcode}NEW.csv')
    return df
# RUN FUNCTION
search_yelp(search, zipcode)
# RUN FUNCTION
scrape_yelp(url)
# RUN FUNCTION: build the DataFrame once and reuse it, so the CSV is only written a single time
yelp_df = create_table()
yelp_df
# plot the distribution of Yelp ratings
plt.subplots(figsize=(8, 8))
sns.countplot(x='YelpRating',
              data=yelp_df,
              order=yelp_df['YelpRating'].value_counts().index)
# business to scrape for reviews; Yelp paginates reviews 20 per page via the ?start parameter
main = 'https://www.yelp.com/biz/danckuts-irvine'
url2 = [main] + [f'{main}?start={start}' for start in range(20, 320, 20)]
for i in url2:
    print(i)
# empty lists to append to
reviews = []
star_rating = []
# function to scrape the review blocks and star ratings from each review page
def get_review_content(url2):
    for page in url2:
        response = requests.get(page)
        soup = BeautifulSoup(response.text, 'html.parser')
        # each 'review-content' block holds one review (date and text)
        for rc in soup.find_all(attrs={'class': 'review-content'}):
            reviews.append(rc.text.strip())
        for rating in soup.select(".rating-large"):
            star_rating.append(rating.get("title"))
# Run the function
get_review_content(url2)
# convert reviews to pd series
reviews_s = pd.Series(reviews)
# convert ratings to pd series
ratings_s = pd.Series(star_rating)
# create a pandas df from reviews
data = {'Reviews' : reviews_s,
'StarRating': ratings_s}
df2 = pd.DataFrame.from_dict(data)
df2 = df2[['Reviews', 'StarRating']]
# split each review block: the date is the first line, the review text follows the first blank line
df2['Date'] = df2.Reviews.str.split('\n').str.get(0)
df2['Review'] = df2.Reviews.str.split('\n\n').str.get(1)
# drop Reviews column
df2.drop('Reviews', axis=1, inplace=True)
# replace empty strings with NaN so they can be dropped
df3 = df2.replace(r"", np.nan)
# drop NaN rows
df4 = df3.dropna()
# reset index
df4.reset_index(drop=True, inplace=True)
from textblob import TextBlob
# convert Review column in df4 to list
review_list = df4['Review'].tolist()
sentiment_score = []
# score each review's polarity with TextBlob
for r in review_list:
    sentiment = TextBlob(r)
    sentiment_score.append(sentiment.sentiment.polarity)
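# TextBlob's polarity score ranges from -1.0 (most negative) to 1.0 (most positive).
# A quick sanity check on made-up strings (hypothetical examples, not scraped reviews):
TextBlob("The haircut was fantastic and the staff were friendly").sentiment.polarity  # > 0
TextBlob("Terrible service, very rude staff").sentiment.polarity                      # < 0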
# convert to series
sentiment_score_convert = pd.Series(sentiment_score)
# add the sentiment scores to df4 as a new column
df4.insert(loc=1, column='SentimentScore', value=sentiment_score_convert)
sentiment = []
# label each score: Positive if polarity > 0, otherwise Negative (a score of exactly 0 is treated as Negative)
for s in sentiment_score:
    if s > 0:
        sentiment.append('Positive')
    else:
        sentiment.append('Negative')
# convert sentiment to a pandas series
sentiment_2 = pd.Series(sentiment)
# add the sentiment labels to df4 as a new column
df4.insert(loc=2, column='Sentiment', value=sentiment_2)
# convert Sentiment label to a numerical value
df4['SentimentNum'] = df4.Sentiment.map({'Positive': 0, 'Negative': 1})
# extract business name from "main" url
bizname_split = main.split('https://www.yelp.com/biz/')
bizname = bizname_split[1]
bizname
# pull the numeric rating out of the StarRating title text (e.g. '5.0 star rating' -> '5.0 ')
df4['Rating'] = df4.StarRating.str.split('star rating').str.get(0)
# reorder columns
df4 = df4[['Date', 'SentimentScore', 'Sentiment', 'SentimentNum', 'StarRating', 'Rating','Review']]
df4.head()
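# As a quick check that the TextBlob scores track the scraped star ratings, the mean
# sentiment score per rating can be compared (a sketch using the columns built above;
# the split usually leaves a trailing space on Rating, so it is stripped before grouping):
df4.groupby(df4['Rating'].str.strip())['SentimentScore'].mean()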