This is a capstone project for Python Data Products for Predictive Analytics Specialization.
The project includes four tasks.
The dataset of interest is the Amazon Customer Reviews Dataset on Home Entertainment items, which can be found on the Amazon Customer Reviews Library site.
Below is the data dictionary for this dataset:
import gzip
from collections import defaultdict
import random
import numpy
import scipy.optimize
import string
from sklearn import linear_model
from nltk.stem.porter import PorterStemmer # Stemming
Cast the vote counts and star rating to int, and convert the verified-purchase column from string to Boolean.
# Load the gzipped TSV of reviews, casting numeric columns to int and
# converting the verified_purchase flag ('Y'/'N') to a bool.
# Fix: the file handle was previously opened with gzip.open() and never
# closed; a `with` block guarantees it is released.
path = "amazon_reviews_us_Home_Entertainment_v1_00.tsv.gz"
dataset = []
with gzip.open(path, 'rt', encoding="utf8") as f:
    header = f.readline().strip().split('\t')
    for line in f:
        fields = line.strip().split('\t')
        d = dict(zip(header, fields))
        d['star_rating'] = int(d['star_rating'])
        d['helpful_votes'] = int(d['helpful_votes'])
        d['total_votes'] = int(d['total_votes'])
        d['verified_purchase'] = d['verified_purchase'] == 'Y'
        dataset.append(d)

# below shows what a typical entry would look like
dataset[0]

# shuffling data so the train/test split below is random
random.shuffle(dataset)
dataset[0]
Use the first 80% of the data for training and the remaining 20% for testing.
# 80/20 split of the shuffled data into training and test sets
N = len(dataset)
split = 4 * N // 5
trainingSet = dataset[:split]
testSet = dataset[split:]
print(len(trainingSet), len(testSet))
Based on the Training Set, below are some questions we can answer:
# functions
def average_rating(dataset):
    """Return the mean 'star_rating' over all records in dataset.

    Args:
        dataset: list of review dicts, each with an int 'star_rating' key.
    Returns:
        The mean rating as a float.
    Raises:
        ZeroDivisionError: if dataset is empty (unchanged from before).
    """
    # sum() over a generator replaces the manual index loop
    return sum(d['star_rating'] for d in dataset) / len(dataset)
def verified_purchases_ct(dataset):
    """Return the percentage of verified purchases, rounded to 2 decimals.

    Args:
        dataset: list of review dicts with a boolean 'verified_purchase' key.
    Returns:
        Percentage (0-100) as a float rounded to two decimal places.
    """
    # counting with sum() replaces the manual index loop and counter
    count = sum(1 for d in dataset if d['verified_purchase'])
    return round(count / len(dataset) * 100, 2)
def total_usersOrItems(dataset, id_string):
    """Return the number of distinct values of the given id column.

    Args:
        dataset: list of review dicts.
        id_string: key to count distinct values of, e.g. 'customer_id'
            or 'product_id'.
    Returns:
        Count of unique ids as an int.
    """
    # a set comprehension over the dicts replaces the index-based generator
    return len({d[id_string] for d in dataset})
def five_star_ct(dataset):
    """Return the percentage of 5-star reviews, rounded to 2 decimals.

    Args:
        dataset: list of review dicts with an int 'star_rating' key.
    Returns:
        Percentage (0-100) as a float rounded to two decimal places.
    """
    # the original loop carried a redundant `continue`; a sum() over a
    # filtered generator expresses the count directly
    five_star = sum(1 for d in dataset if d['star_rating'] == 5)
    return round(five_star / len(dataset) * 100, 2)
# Summary statistics on the training set.
# Fix: the comma after "%" on the second line was missing, so "%" and
# "\n3. " were implicitly concatenated into one argument instead of
# being two separate print arguments like every other line.
print("1. ", average_rating(trainingSet),
      "\n2. ", verified_purchases_ct(trainingSet), "%",
      "\n3. ", total_usersOrItems(trainingSet, 'customer_id'),
      "\n4. ", total_usersOrItems(trainingSet, 'product_id'),
      "\n5. ", five_star_ct(trainingSet), "%")
def feat_eng(dataset):
    """Add a 'len_review' feature to every record, in place.

    'len_review' is the character count of 'review_body', including
    punctuation and whitespace. The original implementation looped over
    every character and added 1 per character (len(c) is always 1 for a
    single character), which is exactly len(review_body).

    Args:
        dataset: list of review dicts with a 'review_body' string key.
    Returns:
        The same list, with 'len_review' (int) added to each dict.
    """
    for d in dataset:
        d['len_review'] = len(d['review_body'])
    return dataset
def feature_vector(data):
    """Build one feature vector per record: [bias, star_rating, len_review].

    The leading 1 is the intercept term for the linear model.
    """
    return [[1, record['star_rating'], record['len_review']] for record in data]
# Attach the 'len_review' feature to both splits
trainingSet = feat_eng(trainingSet)
testSet = feat_eng(testSet)
Fit the model
# Assemble feature matrices and verified-purchase labels for both splits
features_train = feature_vector(trainingSet)
features_test = feature_vector(testSet)
label_train = [d['verified_purchase'] for d in trainingSet]
label_test = [d['verified_purchase'] for d in testSet]

# Fit the classifier and show its built-in training score
model = linear_model.LogisticRegression()
model.fit(features_train, label_train)
print(model.score(features_train, label_train))

# Manual accuracy on both splits: fraction of predictions matching labels
label_pred_train = model.predict(features_train)
label_pred_test = model.predict(features_test)
correct_train = label_pred_train == label_train
correct = label_pred_test == label_test
accuracy_train = sum(correct_train) / len(correct_train)
accuracy = sum(correct) / len(correct)
print("Training accuracy of the model = ", accuracy_train)
print("Testing accuracy of the model = ", accuracy)
# Confusion-matrix counts on the test split, then the Balanced Error Rate
TP = FP = TN = FN = 0
for pred, label in zip(label_pred_test, label_test):
    if pred and label:
        TP += 1
    elif pred:
        FP += 1
    elif label:
        FN += 1
    else:
        TN += 1
print("TP = " + str(TP))
print("FP = " + str(FP))
print("TN = " + str(TN))
print("FN = " + str(FN))
# BER = mean of false-positive rate and false-negative rate
BER = 0.5*(FP/(TN+FP) + FN/(FN+TP))
print("Balanced Error Rate = " + str(BER))
We are going to work with a smaller Sample Set here, as stemming on the normal training set will take a very long time.
# Accumulators for word frequencies, with and without stemming
wordCount = defaultdict(int)
wordCountStem = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()  # apply with stemmer.stem(word)

# Work on a 20% sample of the training set, since stemming is slow
sampleSet = trainingSet[:2 * len(trainingSet) // 10]
len(sampleSet)
def word_ct(dataset, string):
    """Count word frequencies over all review bodies in dataset.

    Reviews are lowercased and stripped of punctuation before splitting
    on whitespace. Counts accumulate into the module-level dictionaries.

    Args:
        dataset: iterable of review dicts with a 'review_body' key.
        string: 'reg' to fill the global wordCount with raw tokens, or
            'stem' to fill the global wordCountStem with Porter-stemmed
            tokens. NOTE(review): this parameter name shadows the stdlib
            `string` module; kept for interface compatibility.
    Returns:
        The populated global counter for the requested mode.
    Raises:
        ValueError: for any other mode (previously returned None silently).
    """
    if string not in ('reg', 'stem'):
        raise ValueError("mode must be 'reg' or 'stem', got %r" % (string,))
    # the two original branches differed only in stemming; merge them
    target = wordCount if string == 'reg' else wordCountStem
    for d in dataset:
        r = "".join(c for c in d["review_body"].lower() if c not in punctuation)
        for w in r.split():
            if string == 'stem':
                w = stemmer.stem(w)
            target[w] += 1
    return target
# Build both vocabularies from the sample and report their sizes
wordCount = word_ct(sampleSet, 'reg')
wordCountStem = word_ct(sampleSet, 'stem')
print("#1. Number of unique words without stemming: ", len(wordCount))
print("#2. Number of unique words with stemming: ", len(wordCountStem))
def feature_reg(datum):
    """Bag-of-words feature vector for a single review.

    Counts occurrences of each top-vocabulary word (module-level `words`,
    `wordId`, `wordSet`) in datum['review_body'], lowercased and with
    punctuation removed.

    Args:
        datum: a review dict with a 'review_body' string key.
    Returns:
        A list of len(words) ints: per-word occurrence counts.
    """
    feat = [0] * len(words)
    # BUG FIX: the original read the leftover module-global `d` instead of
    # the `datum` parameter, so every call featurized the same review.
    review = datum['review_body'].lower()
    r = ''.join(c for c in review if c not in punctuation)
    for w in r.split():
        if w in wordSet:
            feat[wordId[w]] += 1
    return feat
def MSE(predictions, labels):
    """Mean squared error between paired predictions and labels."""
    squared_errors = [(p - l) ** 2 for p, l in zip(predictions, labels)]
    return sum(squared_errors) / len(squared_errors)
# Rank words by frequency (descending) and keep the 1000 most common
# as the regression vocabulary.
# Note: increasing the size of the dictionary may require a lot of memory
counts = sorted(((wordCount[w], w) for w in wordCount), reverse=True)
words = [w for _, w in counts[:1000]]
wordId = {w: i for i, w in enumerate(words)}
wordSet = set(words)
# Bag-of-words design matrices; labels are the star ratings
X_train = [feature_reg(d) for d in sampleSet]
y_train = [d["star_rating"] for d in sampleSet] # the y_reg vector
X_test = [feature_reg(d) for d in testSet]
y_test = [d["star_rating"] for d in testSet]
# Ridge regression: linear regression with an L2 penalty
rg_model = linear_model.Ridge(alpha = 1.0, fit_intercept = True)
rg_model.fit(X_train,y_train)
# Predicting the star_rating for testSet based on X_test, which are the feature variables
X_pred = rg_model.predict(X_test)
# Below is the Logistic Regression Model (ratings treated as classes)
lg_model = linear_model.LogisticRegression()
lg_model.fit(X_train,y_train)
lg_X_pred = lg_model.predict(X_test)
# Compare the two models by MSE against the true test ratings
print('MSE score of Logistic Regression Model: ', MSE(lg_X_pred, y_test))
print('MSE score of Ridge Regression Model: ', MSE(X_pred, y_test))
Here the Ridge Regression Model performs better as it gives a lower MSE value.
We use a simple latent factor-based recommender system to make rating predictions, and then evaluate the performance of those predictions.
Back to using the trainingSet.
# Group the training reviews by user and by item
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
for review in trainingSet:
    reviewsPerUser[review['customer_id']].append(review)
    reviewsPerItem[review['product_id']].append(review)

# Bias terms for the latent-factor model, keyed by user / item id
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

# Sizes used when packing/unpacking the flat parameter vector
N = len(trainingSet)
nUsers = len(reviewsPerUser)
nItems = len(reviewsPerItem)

# Fixed key orderings for the parameter vector
users = list(reviewsPerUser.keys())
items = list(reviewsPerItem.keys())

# Ground-truth ratings, aligned with trainingSet order
y_rec = [d['star_rating'] for d in trainingSet]
# 1. Baseline: predict the global mean rating for every review
avg_mean = average_rating(trainingSet)
ratingMean = [avg_mean] * len(trainingSet)
# 2. MSE of the constant-mean baseline
MSE(ratingMean, y_rec)
Here we are defining the functions needed to optimize the MSE value.
# Start the global offset at the mean rating
alpha = avg_mean

def prediction(user, item):
    """Predicted rating: global offset plus user bias plus item bias."""
    user_bias = userBiases[user]
    item_bias = itemBiases[item]
    return alpha + user_bias + item_bias
def unpack(theta):
    """Scatter the flat parameter vector into the model's globals.

    Layout: theta[0] is alpha; the next nUsers entries are user biases
    (in `users` order); the remaining entries are item biases (in
    `items` order).
    """
    global alpha
    global userBiases
    global itemBiases
    alpha = theta[0]
    userBiases = dict(zip(users, theta[1:nUsers+1]))
    itemBiases = dict(zip(items, theta[1+nUsers:]))
def cost(theta, labels, lamb):
    """L2-regularized MSE objective for the bias-only latent-factor model.

    Side effects: unpacks theta into the global alpha/userBiases/itemBiases
    and prints the unregularized MSE on every call (progress monitoring
    during optimization).
    """
    unpack(theta)
    predictions = [prediction(d['customer_id'], d['product_id']) for d in trainingSet]
    cost = MSE(predictions, labels)
    print("MSE = " + str(cost))
    # add the L2 penalty on every user and item bias term
    for u in userBiases:
        cost += lamb*userBiases[u]**2
    for i in itemBiases:
        cost += lamb*itemBiases[i]**2
    return cost
def derivative(theta, labels, lamb):
    """Gradient of cost() with respect to theta, in the same packing order.

    NOTE(review): `labels` is unused here; residuals are taken against
    d['star_rating'] directly, which matches y_rec for the training set.
    """
    unpack(theta)
    N = len(trainingSet)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    for d in trainingSet:
        u,i = d['customer_id'], d['product_id']
        pred = prediction(u, i)
        diff = pred - d['star_rating']
        # each residual contributes 2/N * diff to every parameter involved
        dalpha += 2/N*diff
        dUserBiases[u] += 2/N*diff
        dItemBiases[i] += 2/N*diff
    # gradient of the L2 penalty terms
    for u in userBiases:
        dUserBiases[u] += 2*lamb*userBiases[u]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
    # pack in the same order unpack() expects: [alpha, users..., items...]
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    return numpy.array(dtheta)
# Optimize [alpha, user biases..., item biases...] with L-BFGS-B;
# initial biases are 0 and the regularization strength lambda is 0.001
scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + [0.0]*(nUsers+nItems),
                             derivative, args = (y_rec, 0.001))
Notice the optimized MSE converges to roughly 2.02527
# Structures for Jaccard-based item similarity: which users bought each
# item, which items each user bought, and a title lookup per item.
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
itemTitle = {}
for review in trainingSet:
    u = review['customer_id']
    i = review['product_id']
    usersPerItem[i].add(u)
    itemsPerUser[u].add(i)
    itemTitle[i] = review['product_title']
def Jaccard(s1, s2):
    """Jaccard similarity |s1 & s2| / |s1 | s2| between two sets.

    Fix: returns 0.0 when both sets are empty instead of raising
    ZeroDivisionError.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        return 0.0
    return len(s1.intersection(s2)) / denom
def mostSimilar(iD, m):
    """Return the m items most similar to item iD.

    Similarity is the Jaccard similarity of the purchaser sets in the
    module-level usersPerItem. Returns (similarity, item_id) pairs in
    descending order; iD itself is excluded.
    """
    target_users = usersPerItem[iD]
    similarities = []
    for other in usersPerItem:
        if other == iD:
            continue
        sim = Jaccard(target_users, usersPerItem[other])
        similarities.append((sim, other))
    return sorted(similarities, reverse=True)[:m]
# Example query: the product from the 11th training review
query = trainingSet[10]['product_id']
print("Item id: ", query)
print("Corresponding item name: ", itemTitle[query])

# Top 10 most-similar item ids with their similarity, descending
mostSimilar(query, 10)

# The same items, resolved to their titles
[(item, itemTitle[item]) for _, item in mostSimilar(query, 10)]