This code reads in the responses from the Google Forms Survey below. The survey responses are exported in CSV format.

The numeric_survey function below takes the responses and converts them to numeric values. This allows us to perform calculations on the responses to find correlations and create a GLM.


Google Forms Survey

title


In [2]:
# dependencies
import pandas as pd
import numpy as np
In [3]:
# import googlespreadsheet from survey

googlecsv = pd.read_csv('../data/survey041718.csv')
In [7]:
# numeric scale legends

# gender numeric legend
gender_scale = pd.DataFrame({
    'gender':['Male','Female','Other',''],
    'gender_values':['1','0','2','3']
})
gender_scale
#gender_scale.to_csv('gender_scale.csv',index=False)

# race numeric legend
race_scale = pd.DataFrame({
    'race':['Asian / Pacific Islander',
    'Black or African American',
    'Hispanic or Latino',
    'White',
    'Other',''
    ],
    'race_values':[1,2,3,4,5,0]
})
race_scale
#race_scale.to_csv('race_scale.csv',index=False)

#income numeric legend
income_scale = pd.DataFrame({
    'income':['','Less than $25,000',
    '$25,000 to $34,999',
    '$35,000 to $49,999',
    '$50,000 to $74,999',
    '$75,000 to $99,999',
    '$100,000 to $149,999',
    '$150,000 to $199,999',
    '$200,000 or more'
    ], 'income_values':[0,1,2,3,4,5,6,7,8]})
income_scale
#income_scale.to_csv('income_scale.csv',index=False)

# days since last haircut numeric legend
days_last_scale = pd.DataFrame({
    'days_last_haircut':['','1 week',
    '2 weeks',
    '3 weeks',
    '1 month',
    '2 months ago',
    '6 weeks',
    '3+ months',
    'other'
    ],'days_last_values':[0,1,2,3,4,5,6,7,8]
})
days_last_scale
#days_last_scale.to_csv('days_last_scale.csv',index=False)


# time between haircuts numeric legend
time_between_scale = pd.DataFrame({
    'time_between_haircut':['','1 week',
    '2 weeks',
    '3 weeks',
    '1 month',
    '2 months ago',
    '6 weeks',
    '3+ months',
    'other'
    ],'time_between_values':[0,1,2,3,4,5,6,7,8]
})
time_between_scale
#time_between_scale.to_csv('time_between_scale.csv',index=False)


# purchased products numeric legend
products_scale = pd.DataFrame({
    'products':['Yes','No'],
    'products_values':[1,0]
})
products_scale
#products_scale.to_csv('products_scale.csv',index=False)


# how much spent numeric scale
spend_scale = pd.DataFrame({
    'did_spend':['','10',
    '11 - 20',
    '21 - 30',
    '31 - 40',
    '40 +',
    'other',
    ],
    'spend_values':[0,1,2,3,4,5,6]
})
spend_scale
#spend_scale.to_csv('spend_scale.csv',index=False)


# max spend numeric scale
max_spend_scale = pd.DataFrame({
    'max_spend':['','10',
    '11 - 20',
    '21 - 30',
    '31 - 40',
    '41 - 50',
    '51 - 60',
    '61 - 70',
    'other'
    ],
    'max_spend_values':[0,1,2,3,4,5,6,7,8]
})
max_spend_scale
#max_spend_scale.to_csv('max_spend_scale.csv',index=False)


# how find shop numeric scale
how_find_scale = pd.DataFrame({
    'how_find':['','Walk in',
    'Referral',
    'Yelp',
    'Google',
    'Facebook',
    'other'
               ],
    'how_find_values':[0,1,2,3,4,5,6]
})
how_find_scale
#how_find_scale.to_csv('how_find_scale.csv',index=False)


# leave a review numeric scale
review_scale = pd.DataFrame({
    'leave_review':['Yes','No','Sometimes'],
    'review_values':[1,2,3]
})
#review_scale.to_csv('review_scale.csv',index=False)
In [8]:
gender_scale
Out[8]:
gender gender_values
0 Male 1
1 Female 0
2 Other 2
3 3
In [9]:
race_scale
Out[9]:
race race_values
0 Asian / Pacific Islander 1
1 Black or African American 2
2 Hispanic or Latino 3
3 White 4
4 Other 5
5 0
In [10]:
income_scale
Out[10]:
income income_values
0 0
1 Less than $25,000 1
2 $25,000 to $34,999 2
3 $35,000 to $49,999 3
4 $50,000 to $74,999 4
5 $75,000 to $99,999 5
6 $100,000 to $149,999 6
7 $150,000 to $199,999 7
8 $200,000 or more 8
In [11]:
days_last_scale
Out[11]:
days_last_haircut days_last_values
0 0
1 1 week 1
2 2 weeks 2
3 3 weeks 3
4 1 month 4
5 2 months ago 5
6 6 weeks 6
7 3+ months 7
8 other 8
In [12]:
time_between_scale
Out[12]:
time_between_haircut time_between_values
0 0
1 1 week 1
2 2 weeks 2
3 3 weeks 3
4 1 month 4
5 2 months ago 5
6 6 weeks 6
7 3+ months 7
8 other 8
In [13]:
products_scale
Out[13]:
products products_values
0 Yes 1
1 No 0
In [14]:
spend_scale
Out[14]:
did_spend spend_values
0 0
1 10 1
2 11 - 20 2
3 21 - 30 3
4 31 - 40 4
5 40 + 5
6 other 6
In [15]:
max_spend_scale
Out[15]:
max_spend max_spend_values
0 0
1 10 1
2 11 - 20 2
3 21 - 30 3
4 31 - 40 4
5 41 - 50 5
6 51 - 60 6
7 61 - 70 7
8 other 8
In [16]:
how_find_scale
Out[16]:
how_find how_find_values
0 0
1 Walk in 1
2 Referral 2
3 Yelp 3
4 Google 4
5 Facebook 5
6 other 6
In [17]:
review_scale
Out[17]:
leave_review review_values
0 Yes 1
1 No 2
2 Sometimes 3
In [18]:
def numeric_survey(csv):
    
    surveyimport = csv
    # resave dataframe with relative columns
    googledf = surveyimport[['What is your work zip code?','What is your home zip code?','Gender','Age','Race','Income','How long since your last haircut?','How long between typical haircuts?','Did you buy any products after your haircut?','How much did you spend on your haircut?','What is the most you would spend on a haircut?','How did you find your current barber?','Do you leave reviews on Yelp, Google, Etc? ','How important is Price ( 1 = not important, 5 = very important)','How important is Convenience( 1 = not important, 5 = very important)','How important is Atmosphere( 1 = not important, 5 = very important)','How important are Additional Services ( 1 = not important, 5 = very important)']]
    
    # rename columns to match numeric scale legends keys
    googledf.columns = ['work_zip','home_zip','gender','age','race','income','days_last_haircut','time_between_haircut','products','did_spend','max_spend','how_find','leave_review','price','convenient','atmosphere','amenities']
    
    # merge legend tables with current dataframe to have all items in numeric terms for prediction model
    legendDF = googledf.merge(gender_scale, on='gender',how='left')
    legendDF2 = legendDF.merge(race_scale, on='race',how='left')
    legendDF3 = legendDF2.merge(income_scale, on='income', how='left')
    legendDF4 = legendDF3.merge(days_last_scale, on='days_last_haircut',how='left')
    legendDF5 = legendDF4.merge(time_between_scale, on='time_between_haircut',how='left')
    legendDF6 = legendDF5.merge(products_scale, on='products',how='left')
    legendDF7 = legendDF6.merge(spend_scale, on='did_spend',how='left')
    legendDF8 = legendDF7.merge(max_spend_scale, on='max_spend',how='left')
    legendDF9 = legendDF8.merge(how_find_scale, on='how_find',how='left')
    legendDF_final = legendDF9.merge(review_scale, on='leave_review',how='left')
    
    # create dataframe with numeric scale values
    legendDF_values = legendDF_final[['work_zip','home_zip','gender_values','age','race_values','income_values','days_last_values','time_between_values','products_values','spend_values','max_spend_values','how_find_values','review_values','price','convenient','atmosphere','amenities']]
    
    # fill nan values with 0 for the scale/legend on non-legend responses
    surveyFinal = legendDF_values.fillna(0)
    
    # import time dependency fro today's date timestamp
    import datetime as dt
    date = dt.datetime.today().strftime("%m%d%Y")

    surveyFinal.to_csv(f'../data/survey{date}.csv', index=False)
    return surveyFinal.head()
In [19]:
numeric_survey(googlecsv)
Out[19]:
work_zip home_zip gender_values age race_values income_values days_last_values time_between_values products_values spend_values max_spend_values how_find_values review_values price convenient atmosphere amenities
0 92614 92614 1 38.0 1.0 1.0 1.0 3.0 0.0 5.0 5.0 3.0 1.0 4 4 4 1
1 92660 92677 1 34.0 4.0 5.0 6.0 7.0 0.0 5.0 7.0 2.0 2.0 5 3 3 1
2 92612 92602 0 35.0 4.0 4.0 0.0 7.0 1.0 5.0 0.0 3.0 2.0 5 5 4 4
3 92620 92780 1 35.0 1.0 4.0 1.0 3.0 1.0 5.0 6.0 3.0 3.0 3 5 4 4
4 97205 97205 1 38.0 4.0 4.0 1.0 6.0 0.0 5.0 5.0 4.0 3.0 4 2 3 2