In [1]:
# Dependencies
import requests
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import re

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import warnings
warnings.filterwarnings('ignore')
In [2]:
# Create input variables
search = input("What would you like to search: ")
zipcode = input("Where would you like to search: ")
What would you like to search: Barbers
Where would you like to search: 92620
In [3]:
# default web search to concatenate
default = "https://www.yelp.com/search?find_desc="
locationUrl = "&find_loc="
end = "&ns=1"
In [4]:
# empty lists to append to
url = []
business_name = []
business_category = []
yelp_rating = []
review_count = []
price_range = []
price_category = []
address = []
phone = []
website = []
In [5]:
# function to search input parameters on Yelp
def search_yelp(search, zipcode):
    
    keyword = search;
    zipcode = zipcode;
    query_url = f"{default}{search}{locationUrl}{zipcode}{end}";
    response = requests.get(query_url)
    soup = BeautifulSoup(response.text, 'html.parser');
    for link in soup.find_all('a', class_="biz-name"):
        url.append("https://www.yelp.com"+link.get('href'))
    url.pop(0)
In [6]:
# function to scrape yelp data on each page
def scrape_yelp(url):
    for i in url:
        response = requests.get(i)
        soup = BeautifulSoup(response.text, 'html.parser')
        business_name.append(soup.find('h1', attrs={'class':'biz-page-title'}).text.strip())
        business_category.append(soup.find(attrs={'class': 'category-str-list'}).text.strip())
        yelp_rating.append(soup.find(attrs={"class":"i-stars"})["title"])
        review_count.append(soup.find(attrs={'class': 'review-count rating-qualifier'}).text.strip())
        
        if soup.find(attrs={'class': 'business-attribute price-range'}) is not None:
            price_range.append(soup.find(attrs={'class': 'business-attribute price-range'}).text.strip())
        else:
            price_range.append('unknown')
        
        if soup.find(attrs={'class': 'nowrap price-description'}) is not None:
            price_category.append(soup.find(attrs={'class': 'nowrap price-description'}).text.strip())
        else:
            price_category.append('unkown')
        
        address.append(soup.find(attrs={'class': 'street-address'}).text.strip())
        phone.append(soup.find(attrs={'class': 'biz-phone'}).text.strip())
        
        if soup.find("a", href=lambda href: href and "biz_redir?" in href) is not None:
            website.append(soup.find("a", href=lambda href: href and "biz_redir?" in href).text.strip())
        else:
            website.append('no website')       
In [7]:
# create a dataframe with yelp data from yelp scrape
def create_table():
    data = {
    'BusinessName' : business_name,
    'BusinessCategory' : business_category,
    'YelpRating' : yelp_rating,
    'ReviewCount' : review_count,
    'PriceRange($)' : price_range,
    'PriceCategory': price_category,
    'Address' : address, 
    'Phone' : phone,
    'Website' : website       
    }
    
    # return data
    df = pd.DataFrame.from_dict(data) 
    
    df = df[['BusinessName','BusinessCategory','YelpRating','ReviewCount','PriceRange($)', 'PriceCategory','Address',
            'Phone', 'Website']]
    df.to_csv(f'data/yelp_{search}{zipcode}NEW.csv')
    
    return df
In [8]:
# RUN FUNCTION
search_yelp(search, zipcode)

# RUN FUNCTION
scrape_yelp(url)

# RUN FUNCTION
create_table()
Out[8]:
BusinessName BusinessCategory YelpRating ReviewCount PriceRange($) PriceCategory Address Phone Website
0 Omar the Barber Barbers 4.5 star rating 306 reviews $ Inexpensive 13925 Yale AveKija Hair SalonIrvine, CA 92620 (714) 618-1133 omarthebarber.com
1 Danckuts Barbers 4.5 star rating 302 reviews $$ Moderate Located inTrabuco Plaza (949) 400-0444 danckuts.com
2 Stonecreek Plaza Barber Shop Barbers 5.0 star rating 139 reviews $ Inexpensive Located inStonecreek Plaza (949) 786-6404 stonecreekplazabarbershop.com
3 Justin’s Fades & Hairstyles Barbers 5.0 star rating 46 reviews $$ Moderate 4330 Barranca PkwySte 105Irvine, CA 92604 (949) 973-1601 justins-fadeshairstyles.busin…
4 Roosters Men’s Grooming Center Barbers 4.0 star rating 169 reviews $$ Moderate 5405 Alton Pkwy Ste DIrvine, CA 92604 (949) 551-5909 roostersmgc.com
5 Hair Days Hair Salons 4.5 star rating 160 reviews $$ Moderate 14231 Jeffrey RdIrvine, CA 92620 (949) 733-3700 hairdaysbeauty.com
6 The Sharper Look Hair Salons,\n Skin Care,\n... 5.0 star rating 63 reviews $$ Moderate 15333 Culver DrSte 21Irvine, CA 92604 (949) 829-3288 thesharperlook.com
7 Made Man Salon Men's Hair Salons,\n Waxing... 5.0 star rating 23 reviews $$ Moderate 2801 El Camino RealSte 28Tustin, CA 92782 (949) 290-8122 mademansalon.com
8 Sport Clips Haircuts of Irvine Barbers,\n Men's Hair Salons 4.0 star rating 191 reviews $$ Moderate 6274 Irvine BlvdIrvine, CA 92620 (949) 748-8228 haircutmenirvineca.com
9 Sand Canyon Barber Barbers 3.5 star rating 93 reviews $$ Moderate 6642 Irvine Center DrIrvine, CA 92618 (949) 727-3000 sandcanyonbarbershop.com
In [9]:
# plot review count
plt.subplots(figsize=(8,8))
sns.countplot(x = 'YelpRating',
             data= create_table(),
             order = create_table()['YelpRating'].value_counts().index)
Out[9]:
(<matplotlib.figure.Figure at 0x1185d5b00>,
 <matplotlib.axes._subplots.AxesSubplot at 0x117dbeba8>)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x117dbeba8>
In [10]:
# business to scrape for reviews
main = 'https://www.yelp.com/biz/danckuts-irvine'
url2 = [main,
       main + '?start=20',
       main + '?start=40',
       main + '?start=60',
       main + '?start=80',
       main + '?start=100',
       main + '?start=120',
       main + '?start=140',
       main + '?start=160',
       main + '?start=180',
       main + '?start=200',
       main + '?start=220',
       main + '?start=240',
       main + '?start=260',
       main + '?start=280',
       main + '?start=300',]
In [11]:
for i in url2:
    print (i)
https://www.yelp.com/biz/danckuts-irvine
https://www.yelp.com/biz/danckuts-irvine?start=20
https://www.yelp.com/biz/danckuts-irvine?start=40
https://www.yelp.com/biz/danckuts-irvine?start=60
https://www.yelp.com/biz/danckuts-irvine?start=80
https://www.yelp.com/biz/danckuts-irvine?start=100
https://www.yelp.com/biz/danckuts-irvine?start=120
https://www.yelp.com/biz/danckuts-irvine?start=140
https://www.yelp.com/biz/danckuts-irvine?start=160
https://www.yelp.com/biz/danckuts-irvine?start=180
https://www.yelp.com/biz/danckuts-irvine?start=200
https://www.yelp.com/biz/danckuts-irvine?start=220
https://www.yelp.com/biz/danckuts-irvine?start=240
https://www.yelp.com/biz/danckuts-irvine?start=260
https://www.yelp.com/biz/danckuts-irvine?start=280
https://www.yelp.com/biz/danckuts-irvine?start=300

Start Review Section

In [12]:
reviews = []
star_rating = []
In [13]:
def get_review_content(url2):
    for i in url:
        response = requests.get(i)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        rc_unstripped = soup.find_all(attrs={'class': 'review-content'})
        rc_length = (len(rc_unstripped))
        
        for i in range (0, rc_length):
            reviews.append(rc_unstripped[i].text.strip()) 
        
        for rating in soup.select(".rating-large"):
            star_rating.append(rating.get("title"))
In [14]:
# Run the function
get_review_content(url2)
In [15]:
# convert reviews to pd series
reviews_s = pd.Series(reviews)

# convert ratings to pd series
ratings_s = pd.Series(star_rating)

# create a pandas df from reviews
data = {'Reviews' : reviews_s,
       'StarRating': ratings_s}

df2 = pd.DataFrame.from_dict(data)
df2 = df2[['Reviews', 'StarRating']]  

# split date and reviews
df2['Date'] = df2.Reviews.str.split('\n').str.get(0)
df2['Review'] = df2.Reviews.str.split('\n\n').str.get(1)

# drop Reviews column
df2.drop('Reviews', axis=1, inplace=True)

# delete NaN rows
df3 = df2.replace(r"", np.nan)

# drop NaN rows
df4 = df3.dropna()

# reset index
df4.reset_index(drop=True, inplace=True)

Pandas DF and Sentiment Section

In [16]:
from textblob import TextBlob

# convert Review column in df4 to list
review_list = df4['Review'].tolist()

sentiment_score = []
# iterate through review list
for r in review_list:
    sentiment = TextBlob(r)
    sentiment_score.append(sentiment.sentiment.polarity)

# convert to series
sentiment_score_convert = pd.Series(sentiment_score)

# add sentiment score to sentiment_df
df4.insert(loc=1, column='SentimentScore', value=sentiment_score_convert)

sentiment = []
# iterate through sentiment_score to assign a pos or neg val
for s in sentiment_score:
    if s > 0:
        sentiment.append('Positive')
    else:
        sentiment.append('Negative')

# convert sentiment to a pandas series
sentiment_2 = pd.Series(sentiment)

# add sentiment score to sentiment_df
df4.insert(loc=2, column='Sentiment', value=sentiment_2)

# convert Sentiment label to a numerical value
df4['SentimentNum'] = df4.Sentiment.map({'Positive': 0,
                                                 'Negative': 1})

# extract business name from "main" url
bizname_split = main.split('https://www.yelp.com/biz/')
bizname = bizname_split[1]
bizname

# split Star Rating and create a new column
df4['Rating'] = df4.StarRating.str.split('star rating').str.get(0)

# reorder columns
df4 = df4[['Date', 'SentimentScore', 'Sentiment', 'SentimentNum', 'StarRating', 'Rating','Review']]
df4.head()
Out[16]:
'danckuts-irvine'
Out[16]:
Date SentimentScore Sentiment SentimentNum StarRating Rating Review
0 4/6/2018 0.246528 Positive 0 5.0 star rating 5.0 Omar is the definition of quick, efficient, an...
1 2/28/2018 0.273810 Positive 0 5.0 star rating 5.0 I rarely leave Yelp reviews. After seeing how ...
2 2/5/2018 0.040833 Positive 0 5.0 star rating 5.0 Thanks to Reddit, I learned of Omar about 2 ho...
3 4/9/2018 0.247222 Positive 0 5.0 star rating 5.0 Omar is not only a great barber. He is super f...
4 4/8/2018 0.450000 Positive 0 5.0 star rating 5.0 Great guy and great cuts. Cash only though so ...