# Dependencies
import requests
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import re

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import warnings
warnings.filterwarnings('ignore')

# Create input variables
search = input("What would you like to search: ")
zipcode = input("Where would you like to search: ")

What would you like to search: Barbers
Where would you like to search: 92620

# default web search to concatenate
default = "https://www.yelp.com/search?find_desc="
locationUrl = "&find_loc="
end = "&ns=1"

# empty lists to append to
url = []
business_name = []
business_category = []
yelp_rating = []
review_count = []
price_range = []
price_category = []
address = []
phone = []
website = []

# function to search input parameters on Yelp
def search_yelp(search, zipcode):
    
    keyword = search;
    zipcode = zipcode;
    query_url = f"{default}{search}{locationUrl}{zipcode}{end}";
    response = requests.get(query_url)
    soup = BeautifulSoup(response.text, 'html.parser');
    for link in soup.find_all('a', class_="biz-name"):
        url.append("https://www.yelp.com"+link.get('href'))
    url.pop(0)

# function to scrape yelp data on each page
def scrape_yelp(url):
    for i in url:
        response = requests.get(i)
        soup = BeautifulSoup(response.text, 'html.parser')
        business_name.append(soup.find('h1', attrs={'class':'biz-page-title'}).text.strip())
        business_category.append(soup.find(attrs={'class': 'category-str-list'}).text.strip())
        yelp_rating.append(soup.find(attrs={"class":"i-stars"})["title"])
        review_count.append(soup.find(attrs={'class': 'review-count rating-qualifier'}).text.strip())
        
        if soup.find(attrs={'class': 'business-attribute price-range'}) is not None:
            price_range.append(soup.find(attrs={'class': 'business-attribute price-range'}).text.strip())
        else:
            price_range.append('unknown')
        
        if soup.find(attrs={'class': 'nowrap price-description'}) is not None:
            price_category.append(soup.find(attrs={'class': 'nowrap price-description'}).text.strip())
        else:
            price_category.append('unkown')
        
        address.append(soup.find(attrs={'class': 'street-address'}).text.strip())
        phone.append(soup.find(attrs={'class': 'biz-phone'}).text.strip())
        
        if soup.find("a", href=lambda href: href and "biz_redir?" in href) is not None:
            website.append(soup.find("a", href=lambda href: href and "biz_redir?" in href).text.strip())
        else:
            website.append('no website')

# create a dataframe with yelp data from yelp scrape
def create_table():
    data = {
    'BusinessName' : business_name,
    'BusinessCategory' : business_category,
    'YelpRating' : yelp_rating,
    'ReviewCount' : review_count,
    'PriceRange($)' : price_range,
    'PriceCategory': price_category,
    'Address' : address, 
    'Phone' : phone,
    'Website' : website       
    }
    
    # return data
    df = pd.DataFrame.from_dict(data) 
    
    df = df[['BusinessName','BusinessCategory','YelpRating','ReviewCount','PriceRange($)', 'PriceCategory','Address',
            'Phone', 'Website']]
    df.to_csv(f'data/yelp_{search}{zipcode}NEW.csv')
    
    return df

# RUN FUNCTION
search_yelp(search, zipcode)

# RUN FUNCTION
scrape_yelp(url)

# RUN FUNCTION
create_table()

# plot review count
plt.subplots(figsize=(8,8))
sns.countplot(x = 'YelpRating',
             data= create_table(),
             order = create_table()['YelpRating'].value_counts().index)

(<matplotlib.figure.Figure at 0x1185d5b00>,
 <matplotlib.axes._subplots.AxesSubplot at 0x117dbeba8>)

<matplotlib.axes._subplots.AxesSubplot at 0x117dbeba8>

# business to scrape for reviews
main = 'https://www.yelp.com/biz/danckuts-irvine'
url2 = [main,
       main + '?start=20',
       main + '?start=40',
       main + '?start=60',
       main + '?start=80',
       main + '?start=100',
       main + '?start=120',
       main + '?start=140',
       main + '?start=160',
       main + '?start=180',
       main + '?start=200',
       main + '?start=220',
       main + '?start=240',
       main + '?start=260',
       main + '?start=280',
       main + '?start=300',]

for i in url2:
    print (i)

https://www.yelp.com/biz/danckuts-irvine
https://www.yelp.com/biz/danckuts-irvine?start=20
https://www.yelp.com/biz/danckuts-irvine?start=40
https://www.yelp.com/biz/danckuts-irvine?start=60
https://www.yelp.com/biz/danckuts-irvine?start=80
https://www.yelp.com/biz/danckuts-irvine?start=100
https://www.yelp.com/biz/danckuts-irvine?start=120
https://www.yelp.com/biz/danckuts-irvine?start=140
https://www.yelp.com/biz/danckuts-irvine?start=160
https://www.yelp.com/biz/danckuts-irvine?start=180
https://www.yelp.com/biz/danckuts-irvine?start=200
https://www.yelp.com/biz/danckuts-irvine?start=220
https://www.yelp.com/biz/danckuts-irvine?start=240
https://www.yelp.com/biz/danckuts-irvine?start=260
https://www.yelp.com/biz/danckuts-irvine?start=280
https://www.yelp.com/biz/danckuts-irvine?start=300

Start Review Section¶

reviews = []
star_rating = []

def get_review_content(url2):
    for i in url:
        response = requests.get(i)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        rc_unstripped = soup.find_all(attrs={'class': 'review-content'})
        rc_length = (len(rc_unstripped))
        
        for i in range (0, rc_length):
            reviews.append(rc_unstripped[i].text.strip()) 
        
        for rating in soup.select(".rating-large"):
            star_rating.append(rating.get("title"))

# Run the function
get_review_content(url2)

# convert reviews to pd series
reviews_s = pd.Series(reviews)

# convert ratings to pd series
ratings_s = pd.Series(star_rating)

# create a pandas df from reviews
data = {'Reviews' : reviews_s,
       'StarRating': ratings_s}

df2 = pd.DataFrame.from_dict(data)
df2 = df2[['Reviews', 'StarRating']]  

# split date and reviews
df2['Date'] = df2.Reviews.str.split('\n').str.get(0)
df2['Review'] = df2.Reviews.str.split('\n\n').str.get(1)

# drop Reviews column
df2.drop('Reviews', axis=1, inplace=True)

# delete NaN rows
df3 = df2.replace(r"", np.nan)

# drop NaN rows
df4 = df3.dropna()

# reset index
df4.reset_index(drop=True, inplace=True)

Pandas DF and Sentiment Section¶

from textblob import TextBlob

# convert Review column in df4 to list
review_list = df4['Review'].tolist()

sentiment_score = []
# iterate through review list
for r in review_list:
    sentiment = TextBlob(r)
    sentiment_score.append(sentiment.sentiment.polarity)

# convert to series
sentiment_score_convert = pd.Series(sentiment_score)

# add sentiment score to sentiment_df
df4.insert(loc=1, column='SentimentScore', value=sentiment_score_convert)

sentiment = []
# iterate through sentiment_score to assign a pos or neg val
for s in sentiment_score:
    if s > 0:
        sentiment.append('Positive')
    else:
        sentiment.append('Negative')

# convert sentiment to a pandas series
sentiment_2 = pd.Series(sentiment)

# add sentiment score to sentiment_df
df4.insert(loc=2, column='Sentiment', value=sentiment_2)

# convert Sentiment label to a numerical value
df4['SentimentNum'] = df4.Sentiment.map({'Positive': 0,
                                                 'Negative': 1})

# extract business name from "main" url
bizname_split = main.split('https://www.yelp.com/biz/')
bizname = bizname_split[1]
bizname

# split Star Rating and create a new column
df4['Rating'] = df4.StarRating.str.split('star rating').str.get(0)

# reorder columns
df4 = df4[['Date', 'SentimentScore', 'Sentiment', 'SentimentNum', 'StarRating', 'Rating','Review']]
df4.head()

'danckuts-irvine'

	BusinessName	BusinessCategory	YelpRating	ReviewCount	PriceRange($)	PriceCategory	Address	Phone	Website
0	Omar the Barber	Barbers	4.5 star rating	306 reviews	$	Inexpensive	13925 Yale AveKija Hair SalonIrvine, CA 92620	(714) 618-1133	omarthebarber.com
1	Danckuts	Barbers	4.5 star rating	302 reviews	$$	Moderate	Located inTrabuco Plaza	(949) 400-0444	danckuts.com
2	Stonecreek Plaza Barber Shop	Barbers	5.0 star rating	139 reviews	$	Inexpensive	Located inStonecreek Plaza	(949) 786-6404	stonecreekplazabarbershop.com
3	Justin’s Fades & Hairstyles	Barbers	5.0 star rating	46 reviews	$$	Moderate	4330 Barranca PkwySte 105Irvine, CA 92604	(949) 973-1601	justins-fadeshairstyles.busin…
4	Roosters Men’s Grooming Center	Barbers	4.0 star rating	169 reviews	$$	Moderate	5405 Alton Pkwy Ste DIrvine, CA 92604	(949) 551-5909	roostersmgc.com
5	Hair Days	Hair Salons	4.5 star rating	160 reviews	$$	Moderate	14231 Jeffrey RdIrvine, CA 92620	(949) 733-3700	hairdaysbeauty.com
6	The Sharper Look	Hair Salons,\n Skin Care,\n...	5.0 star rating	63 reviews	$$	Moderate	15333 Culver DrSte 21Irvine, CA 92604	(949) 829-3288	thesharperlook.com
7	Made Man Salon	Men's Hair Salons,\n Waxing...	5.0 star rating	23 reviews	$$	Moderate	2801 El Camino RealSte 28Tustin, CA 92782	(949) 290-8122	mademansalon.com
8	Sport Clips Haircuts of Irvine	Barbers,\n Men's Hair Salons	4.0 star rating	191 reviews	$$	Moderate	6274 Irvine BlvdIrvine, CA 92620	(949) 748-8228	haircutmenirvineca.com
9	Sand Canyon Barber	Barbers	3.5 star rating	93 reviews	$$	Moderate	6642 Irvine Center DrIrvine, CA 92618	(949) 727-3000	sandcanyonbarbershop.com

	Date	SentimentScore	Sentiment	StarRating	Rating	Review
0	4/6/2018	0.246528	Positive	5.0 star rating	5.0	Omar is the definition of quick, efficient, an...
1	2/28/2018	0.273810	Positive	5.0 star rating	5.0	I rarely leave Yelp reviews. After seeing how ...
2	2/5/2018	0.040833	Positive	5.0 star rating	5.0	Thanks to Reddit, I learned of Omar about 2 ho...
3	4/9/2018	0.247222	Positive	5.0 star rating	5.0	Omar is not only a great barber. He is super f...
4	4/8/2018	0.450000	Positive	5.0 star rating	5.0	Great guy and great cuts. Cash only though so ...