import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import math
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
# from sklearn.externals import joblib
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')
%matplotlib inline

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Column names are passed explicitly, so pandas reads the first line of the
# file as data rather than as a header.
rating_columns = ['userId', 'productId', 'Rating', 'timestamp']
electronics_data = pd.read_csv("ratings_Electronics (1).csv", names=rating_columns)

electronics_data.head()

electronics_data.shape

(7824482, 4)

# Work on the first 1,048,576 rows only. The explicit .copy() makes this an
# independent frame, so later in-place edits (such as dropping a column) do not
# operate on a slice of electronics_data.
# NOTE: the misspelt name "eletronics_data" is kept because other cells use it.
eletronics_data = electronics_data.iloc[:1048576, :].copy()

eletronics_data.dtypes

userId        object
productId     object
Rating       float64
timestamp      int64
dtype: object

# Summary statistics (count, mean, std, min, quartiles, max) of the Rating column.
rating_summary = eletronics_data['Rating'].describe()
rating_summary

count    1.048576e+06
mean     3.973380e+00
std      1.399329e+00
min      1.000000e+00
25%      3.000000e+00
50%      5.000000e+00
75%      5.000000e+00
max      5.000000e+00
Name: Rating, dtype: float64

# Report the largest and smallest rating value in the working subset.
highest_rating = eletronics_data.Rating.max()
lowest_rating = eletronics_data.Rating.min()
print("max of rating : %d" % highest_rating)
print("min of rating : %d" % lowest_rating)

max of rating : 5
min of rating : 1

Missing values 수

# Count the null entries in each column of the full ratings frame.
missing_per_column = electronics_data.isnull().sum()
print('Number of missing values across columns: \n', missing_per_column)

Number of missing values across columns: 
 userId       0
productId    0
Rating       0
timestamp    0
dtype: int64

Ratings¶

# Bar chart of how many ratings fall on each rating value.
# sns.factorplot was renamed to catplot in seaborn 0.9 and later removed, and
# the plotted variable must be passed by keyword, hence catplot(x=...).
with sns.axes_style('white'):
    g = sns.catplot(x="Rating", data=eletronics_data, aspect=2.0, kind='count')
    g.set_ylabels("Total number of ratings")

<seaborn.axisgrid.FacetGrid at 0x7faafbe6f130>

# Headline sizes of the working subset: rating rows, distinct users, distinct products.
total_ratings = eletronics_data.shape[0]
total_users = np.unique(eletronics_data.userId).size
total_products = np.unique(eletronics_data.productId).size

print("Total data")
print("-"*50)
print("\nTotal num of ratings :", total_ratings)
print("Total num of Users :", total_users)
print("Total num of products: ", total_products)

Total data
--------------------------------------------------

Total num of ratings : 1048576
Total num of Users : 786330
Total num of products:  61894

# The per-user analysis that follows only needs userId and Rating, so the
# timestamp column is removed. Rebinding (instead of inplace=True) avoids
# mutating a slice of another frame, and errors='ignore' lets this cell be
# re-run after the column is already gone.
eletronics_data = eletronics_data.drop(columns=['timestamp'], errors='ignore')

eletronics_data.head()

Ratings 분석¶

# Number of ratings submitted by each user, most active users first.
ratings_by_user = eletronics_data.groupby(by='userId')['Rating']
num_of_rated_products_per_user = ratings_by_user.count().sort_values(ascending=False)
num_of_rated_products_per_user.head()

userId
A5JLAU2ARJ0BO     412
A231WM2Z2JL0U3    249
A25HBO5V8S8SEA    164
A6FIAB28IS79      146
AT6CZDCP4TRGA     128
Name: Rating, dtype: int64

# Summary statistics of the per-user rating counts.
num_of_rated_products_per_user.describe()

count    786330.000000
mean          1.333506
std           1.385612
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max         412.000000
Name: Rating, dtype: float64

# Quantiles 0.00, 0.01, ..., 1.00 of the per-user rating counts.
# interpolation='higher' returns an observed count instead of interpolating
# between two neighbouring counts.
quantile_levels = np.arange(0, 1.01, 0.01)
quantiles = num_of_rated_products_per_user.quantile(quantile_levels, interpolation='higher')

quantiles

0.00      1
0.01      1
0.02      1
0.03      1
0.04      1
       ... 
0.96      3
0.97      4
0.98      4
0.99      6
1.00    412
Name: Rating, Length: 101, dtype: int64

# Plot the per-user rating count at every quantile level.
plt.figure(figsize=(10,10))
plt.title("Quantiles and their Values")
quantiles.plot()
# Highlight every 5th quantile level (0.05 steps)
plt.scatter(x=quantiles.index[::5], y=quantiles.values[::5], c='orange', label="quantiles with 0.05 intervals")
# Highlight every 25th quantile level (0.25 steps)
plt.scatter(x=quantiles.index[::25], y=quantiles.values[::25], c='m', label = "quantiles with 0.25 intervals")
plt.ylabel('No of ratings by user')
# The x axis carries the quantile level (quantiles.index); the value at that
# quantile is what the y axis shows, so the x label must name the quantile.
plt.xlabel('Quantile')
plt.legend(loc='best')
plt.show()

<Figure size 720x720 with 0 Axes>

Text(0.5, 1.0, 'Quantiles and their Values')

<AxesSubplot:title={'center':'Quantiles and their Values'}>

<matplotlib.collections.PathCollection at 0x7faab1f59700>

<matplotlib.collections.PathCollection at 0x7faab6842c70>

Text(0, 0.5, 'No of ratings by user')

Text(0.5, 0, 'Value at the quantile')

<matplotlib.legend.Legend at 0x7faab1f59af0>

# Number of users with at least 50 ratings (the threshold is inclusive, although
# the printed label says "more than 50"). Series.sum() counts the True values in
# one vectorised pass instead of iterating the boolean Series with builtin sum().
heavy_user_count = (num_of_rated_products_per_user >= 50).sum()
print('\n No of rated product more than 50 per user : {}\n'.format(heavy_user_count))

 No of rated product more than 50 per user : 38

Popularity Based Recommendation¶

평가가 50번 이상 된 상품들을 가져옴

# Keep only the ratings of products that have at least 50 ratings in the full
# data set. transform('count') broadcasts each product's rating count back onto
# its rows, so the filter is one vectorised boolean mask rather than a Python
# lambda call per product; row order is unchanged. .copy() makes new_df
# independent of electronics_data so it can be modified safely later.
ratings_per_product = electronics_data.groupby("productId")['Rating'].transform('count')
new_df = electronics_data[ratings_per_product >= 50].copy()

new_df.head()

new_df.shape

(5374313, 4)

# Ratings received by each retained product, largest count first.
num_of_ratings_per_product = (
    new_df.groupby(by='productId')['Rating']
    .count()
    .sort_values(ascending=False)
)

# Line plot of the sorted counts; the x tick labels are blanked out.
fig, ax = plt.subplots(figsize=plt.figaspect(.5))
ax.plot(num_of_ratings_per_product.values)
ax.set_title('# RATINGS per Product')
ax.set_xlabel('Product')
ax.set_ylabel('No of ratings per product')
ax.set_xticklabels([])

plt.show()

[<matplotlib.lines.Line2D at 0x7faab1fc3100>]

Text(0.5, 1.0, '# RATINGS per Product')

Text(0.5, 0, 'Product')

Text(0, 0.5, 'No of ratings per product')

[Text(-5000.0, 0, ''),
 Text(0.0, 0, ''),
 Text(5000.0, 0, ''),
 Text(10000.0, 0, ''),
 Text(15000.0, 0, ''),
 Text(20000.0, 0, ''),
 Text(25000.0, 0, ''),
 Text(30000.0, 0, '')]

평가들을 상품 아이디로 묶고 평점의 평균을 냄

# Mean rating of each product (first five products shown).
mean_rating_per_product = new_df.groupby('productId')['Rating'].mean()
mean_rating_per_product.head()

productId
0972683275    4.470980
1400501466    3.560000
1400501520    4.243902
1400501776    3.884892
1400532620    3.684211
Name: Rating, dtype: float64

# Products ordered by mean rating, best first (top five shown).
mean_ratings_desc = new_df.groupby('productId')['Rating'].mean().sort_values(ascending=False)
mean_ratings_desc.head()

productId
B002E6R7NG    4.980392
B004I763AW    4.966667
B003J9QQWU    4.964286
B0043ZLFXE    4.955556
B000TMFYBO    4.953125
Name: Rating, dtype: float64

평가된 수를 기준으로 내림차순 정렬

# Products ordered by how many ratings they received, most rated first (top five shown).
rating_counts_desc = new_df.groupby('productId')['Rating'].count().sort_values(ascending=False)
rating_counts_desc.head()

productId
B0074BW614    18244
B00DR0PDNE    16454
B007WTAJTO    14172
B0019EHU8G    12285
B006GWO5WK    12226
Name: Rating, dtype: int64

# One row per product: its mean rating and the number of ratings it received.
ratings_grouped = new_df.groupby('productId')['Rating']
ratings_mean_count = ratings_grouped.mean().to_frame()

ratings_mean_count.head()

ratings_mean_count['rating_counts'] = ratings_grouped.count()

ratings_mean_count.head()

ratings_mean_count['rating_counts'].max()

18244

# Histogram (50 bins) of the number of ratings per product.
plt.figure(figsize=(6, 6))
plt.rcParams['patch.force_edgecolor'] = True
counts_per_product = ratings_mean_count['rating_counts']
counts_per_product.hist(bins=50)

<Figure size 432x432 with 0 Axes>

<AxesSubplot:>

# Histogram (50 bins) of the mean rating per product.
plt.figure(figsize=(8, 6))
plt.rcParams['patch.force_edgecolor'] = True
mean_per_product = ratings_mean_count['Rating']
mean_per_product.hist(bins=50)

<Figure size 576x432 with 0 Axes>

<AxesSubplot:>

# Joint distribution of mean rating vs. number of ratings per product.
# sns.jointplot builds its own figure, so a preceding plt.figure() call only
# leaves behind an extra empty figure; none is created here.
plt.rcParams['patch.force_edgecolor'] = True
sns.jointplot(x='Rating', y='rating_counts', data=ratings_mean_count, alpha=0.4)

<Figure size 576x432 with 0 Axes>

<seaborn.axisgrid.JointGrid at 0x7faab1fb5430>

<Figure size 576x432 with 0 Axes>

# Bar chart of the 30 products with the most ratings.
popular_products = new_df.groupby('productId')['Rating'].count().to_frame()
most_popular = popular_products.sort_values("Rating", ascending=False)
top_thirty = most_popular.head(30)
top_thirty.plot(kind="bar")

<AxesSubplot:xlabel='productId'>

Collaborative Filtering 이용¶

# Surprise 패키지를 이용하기 위해 설치

!pip3 install surprise

Requirement already satisfied: surprise in /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages (0.1)
Requirement already satisfied: scikit-surprise in /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages (from surprise) (1.1.1)
Requirement already satisfied: scipy>=1.0.0 in /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages (from scikit-surprise->surprise) (1.4.1)
Requirement already satisfied: joblib>=0.11 in /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages (from scikit-surprise->surprise) (0.16.0)
Requirement already satisfied: six>=1.10.0 in /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages (from scikit-surprise->surprise) (1.15.0)
Requirement already satisfied: numpy>=1.11.2 in /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages (from scikit-surprise->surprise) (1.19.0)

from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
import os
from surprise.model_selection import train_test_split

1) Data 가져오기¶

# Dataset.load_from_df expects exactly three columns in the order
# user id, item id, rating. Selecting them by name (instead of dropping
# 'timestamp' in place) keeps this cell re-runnable and independent of the
# column order of new_df.
new_df = new_df[['userId', 'productId', 'Rating']]

reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(new_df,reader)

2) Trainset, Testset으로 나누기¶

# NOTE: this resolves to surprise.model_selection.train_test_split, which is
# imported after (and therefore shadows) the sklearn function of the same name.
# 30% of the ratings are held out for testing; random_state fixes the split.
trainset, testset = train_test_split(data, test_size = 0.3, random_state=10)

3) KNN 모델을 trainset에 학습(fit)시키기

# Item-based KNN (user_based=False) with the pearson_baseline similarity and
# k=5 neighbours, fitted on the training split.
similarity_config = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNWithMeans(k=5, sim_options=similarity_config)
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7faa6c2c6dc0>

4) Test accuracy 측정¶

# Predict every held-out rating, then print and return the RMSE.
test_pred = algo.test(testset)

print("Item-based Model : Test Set")
accuracy.rmse(predictions=test_pred, verbose=True)

Item-based Model : Test Set
RMSE: 1.3335

1.3334560899557941

Model-based collaborative filtering system¶

# Can recommend across a large catalogue and many users, and also works with sparse matrices.

# User x product matrix built from the first 10,000 ratings; a user/product
# pair without a rating is filled with 0.
new_df1 = new_df.head(10000)
ratings_matrix = new_df1.pivot_table(
    index='userId',
    columns='productId',
    values='Rating',
    fill_value=0,
)
ratings_matrix.head()

ratings_matrix.shape

(9832, 76)

# Transpose so that each row is a product and each column is a user.
X = ratings_matrix.transpose()
X.head()

X.shape

(76, 9832)

X.index

Index(['0972683275', '1400501466', '1400501520', '1400501776', '1400532620',
       '1400532655', '140053271X', '1400532736', '1400599997', '1400698987',
       '3744295508', '6301977173', '7214047977', '8862935293', '9573212919',
       '9575871979', '9625993428', '9888002198', '9966694544', '9983891212',
       '9984984354', '9985511476', 'B000001OM4', 'B000001OM5', 'B000001OMI',
       'B000001ON6', 'B00000DM9W', 'B00000IGBF', 'B00000J05A', 'B00000J061',
       'B00000J08Q', 'B00000J0D2', 'B00000J0D5', 'B00000J0D8', 'B00000J1EJ',
       'B00000J1EP', 'B00000J1EQ', 'B00000J1F3', 'B00000J1GA', 'B00000J1QK',
       'B00000J1QR', 'B00000J1SC', 'B00000J1TX', 'B00000J1U8', 'B00000J1UQ',
       'B00000J1V3', 'B00000J1V5', 'B00000J3NF', 'B00000J3Q7', 'B00000J3UJ',
       'B00000J434', 'B00000J4EY', 'B00000J4FS', 'B00000J4GE', 'B00000J6WY',
       'B00000JBAT', 'B00000JBHP', 'B00000JBPB', 'B00000JCT8', 'B00000JCTO',
       'B00000JD34', 'B00000JDF5', 'B00000JDF6', 'B00000JDHV', 'B00000JFE3',
       'B00000JFIF', 'B00000JFMK', 'B00000JHWX', 'B00000JI4F', 'B00000JII6',
       'B00000JMUG', 'B00000JPPI', 'B00000JSGF', 'B00000JYLO', 'B00000JYWQ',
       'B00000K135'],
      dtype='object', name='productId')

X1 = X

from sklearn.decomposition import TruncatedSVD
# Reduce each product's user-rating vector to 10 latent components.
# TruncatedSVD's default solver is randomized, so random_state is fixed to make
# the components (and the recommendations derived from them) reproducible
# between runs.
SVD = TruncatedSVD(n_components=10, random_state=10)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape

(76, 10)

decomposed_matrix

array([[ 1.48104508e+02, -1.54627227e-01,  2.45283362e-02,
        -5.08568844e-04, -1.34180041e-01, -3.20183398e-06,
         4.61038303e-02, -1.87763195e-04,  1.51349202e-03,
        -8.41342654e-02],
       [ 7.06973958e-04,  1.28320026e+00,  1.48206046e+00,
         1.82292221e-01,  2.60801854e-02,  5.43852155e-02,
         2.84154218e-02,  1.37114661e-01,  6.12145972e+01,
        -5.20875091e-02],
       [ 1.00121222e-06,  9.54270887e-02,  3.14262401e-01,
        -3.28359079e-04,  2.08422270e-03, -1.29660115e-02,
         9.62856527e-03,  1.46758477e-02,  1.55697876e+00,
        -1.65364410e-01],
       [ 6.46554859e-04,  7.75262173e-01,  4.93737283e-01,
        -2.82380718e-02,  5.73363348e-02,  1.46510933e-01,
         6.44724292e-02,  4.51476761e-01,  3.08227843e+00,
        -5.15067361e-01],
       [ 4.29496792e-04,  8.40511666e-01,  1.08540043e+00,
         6.84425110e-02, -6.51938079e-02, -9.21274356e-02,
        -2.03446581e-02, -1.98261525e-01,  1.34150268e+00,
         2.14354555e+00],
       [ 2.62249281e-01,  8.72497938e+01, -1.42204743e+01,
        -7.81601819e-03, -9.51834200e-05, -2.40129391e-03,
        -7.00002710e-03, -3.34167631e-03, -7.35315872e-01,
        -6.97014961e-03],
       [ 1.02453257e-03,  1.45159414e+01,  8.52277292e+01,
        -5.69835701e-02, -2.54938911e-02, -1.59209007e-04,
        -1.25991438e-02, -1.80730854e-03, -1.25607000e+00,
        -3.11240957e-02],
       [-1.96805301e-06,  6.98581460e-02,  4.07139396e-01,
         2.42412153e-03, -5.16345483e-03,  5.41485793e-03,
        -3.85062831e-03,  6.23454884e-03,  6.15487787e-01,
         1.34655416e-01],
       [ 7.53083775e-04,  7.59591752e-01, -1.14500776e-01,
        -5.74409237e-02,  8.81234145e-02,  1.70513271e-01,
         5.35339894e-02,  5.04657940e-02, -1.82856415e+00,
        -6.00317981e-01],
       [ 2.46821662e-04,  5.43465253e-01,  7.46061169e-01,
        -2.30652452e-02,  1.46389645e-01, -1.28486487e-02,
         4.20706946e-01, -9.67573226e-02,  2.81968786e+00,
         1.03728699e+00],
       [-6.07252699e-07,  1.17717451e-03, -4.90190513e-05,
        -2.30474180e-04,  2.98920936e-04,  1.16104720e-03,
         4.13250720e-03,  6.27431804e-03, -4.99792046e-02,
        -1.01083323e-02],
       [ 3.77534126e-07,  1.69885161e-03,  2.22746215e-03,
         2.12884051e-04,  1.16440213e-03,  5.54841828e-03,
         1.93000506e-03,  2.27384828e-02,  5.95620310e-02,
         2.64962582e-02],
       [ 3.94211951e-05,  4.08878948e-02,  2.87649540e-02,
         1.85666440e-02,  1.04513483e-02,  2.20536697e-02,
         4.36449481e-02,  1.14327682e-01, -8.62955947e-01,
         1.50078814e+00],
       [ 1.27234801e-05, -5.29914580e-04, -2.59462302e-03,
        -3.50534141e-03,  7.00438267e-03,  8.47726165e-03,
         2.83704237e-03, -1.15278422e-02, -2.03512216e-01,
         3.05833566e-02],
       [ 5.90306125e-05,  1.77047089e-04,  8.95567730e-03,
        -3.82815493e-02,  5.92078448e-02,  1.23951511e-01,
         3.94213941e-02,  4.15740197e-01, -4.21901641e+00,
         5.86943295e-01],
       [-2.09990428e-05,  1.63358006e-02,  2.47623451e-02,
         3.95130834e-03, -2.76371930e-03,  4.65859287e-02,
         1.71041855e-02,  1.65022493e-01, -3.98840456e-01,
         5.29601110e-01],
       [-3.30718789e-05, -8.01661165e-03,  1.27810491e-03,
         5.16392844e-03, -1.43488954e-02, -9.83284551e-03,
        -6.14182055e-03,  7.70195154e-03,  3.19363801e-01,
         6.35122577e-02],
       [ 1.71849476e-06, -1.96498578e-04,  4.75306147e-04,
        -5.84857445e-05, -9.99414508e-05,  1.51758286e-03,
         9.39318985e-04,  7.98751303e-04, -3.99680505e-02,
         2.39787563e-02],
       [ 3.10694970e-06,  8.10470964e-06,  8.74815172e-05,
        -1.94704197e-03,  1.18895558e-03,  2.37155395e-03,
        -3.28488946e-03,  7.85559674e-03,  5.18029900e-03,
         1.79003542e-02],
       [ 2.02704689e-01, -9.17846241e-03, -6.77717193e-03,
        -5.82942015e-03,  2.13631503e-03, -4.94108822e-04,
         2.53098093e-03,  5.11224494e-02,  5.09560781e-02,
         6.03303327e+01],
       [ 1.11784268e-06,  5.41686295e-04,  6.11986471e-04,
         1.01673452e-05,  2.71340706e-04,  1.10811822e-03,
        -3.94543197e-04,  6.35591465e-03,  3.71069392e-04,
         1.84549502e-02],
       [ 5.79588502e-06, -2.12998463e-03, -1.52101353e-03,
        -2.04178241e-03, -8.59003044e-04, -5.87765286e-03,
        -6.04231474e-03,  1.21302100e-02,  8.23524954e-02,
         1.17021438e-01],
       [-1.76462977e-05, -7.09694200e-03, -6.39526909e-03,
         3.34839648e-03, -1.05498468e-02, -1.86310217e-02,
        -1.41234906e-02, -5.80240769e-02,  3.27412509e-01,
         2.32688798e-02],
       [ 1.86452484e-01,  2.31451086e-02,  1.99641784e-02,
         1.59104518e-02,  3.83500665e-01,  5.39390649e-02,
        -1.82657746e-01,  9.46017126e-02, -4.57951658e-01,
         1.26364663e+00],
       [-4.44815102e-07,  8.53253065e-05,  8.86288175e-04,
         6.49725484e-03,  4.38266181e-06,  1.43147670e-03,
        -3.61928780e-04,  4.42759459e-01,  1.39259902e-02,
        -1.94370547e-03],
       [ 4.16971084e-04,  5.05664153e-03,  5.55559255e-03,
        -6.50176553e-03,  1.22824242e-02,  6.26203320e-01,
         1.66460990e-02, -5.74859765e-02, -5.12336794e-01,
         1.32247512e+00],
       [ 1.28282952e-06,  1.51713988e-02,  5.19307155e-02,
         8.50818431e+01, -3.23688837e-01, -3.01037415e-02,
        -5.88252441e-04, -1.37598303e+00, -1.35185996e-01,
         2.32898932e-03],
       [ 3.38126242e-06,  4.14991913e-04, -1.84416303e-04,
        -1.38651233e-04,  1.12146086e-03,  3.24595864e-03,
         1.32303572e-03,  3.76945146e-03, -5.44394874e-02,
        -7.97488258e-03],
       [ 9.20608256e-07, -3.50101486e-04,  1.38293839e-05,
        -6.46042314e-04,  1.87794744e-03,  7.25460765e-04,
         5.10355724e-03, -1.35867520e-03, -2.06540636e-02,
         2.90715985e-03],
       [-2.01249631e-06, -1.46286997e-04,  1.33909039e-04,
         1.70252952e+00, -1.20337259e-02,  3.54197832e-01,
        -5.50902650e-03,  6.87943862e+01, -9.12273916e-02,
        -2.68850395e-02],
       [ 1.51793185e-06,  2.78110728e-04,  2.45314304e-04,
        -8.51525269e-04,  1.20373839e-03,  1.32581962e-03,
         6.91162508e-04,  2.26031356e-03, -9.66619064e-03,
         2.29724055e-02],
       [-3.19775545e-07,  1.28871656e-04,  4.98897897e-04,
         1.24791456e-04,  2.48544731e-04,  9.64119053e-04,
         6.00090954e-04,  6.16795561e-04, -1.03158828e-02,
         1.33205189e-03],
       [-3.44820661e-07, -3.05234540e-04, -9.21009183e-05,
         1.14807394e-04, -2.97720269e-04, -4.35691039e-04,
        -7.71910848e-04,  4.41194730e-04,  1.08052855e-02,
         4.99200638e-03],
       [ 8.32126308e-09,  1.81034453e-06, -4.28985676e-07,
        -9.29644706e-07,  5.97588352e-06,  7.15867590e-06,
        -8.48533281e-06,  3.76887638e-05, -1.04646979e-04,
        -1.23825283e-04],
       [ 3.83436215e-08, -2.37497031e-04, -1.34147016e-04,
        -2.22239918e-04,  2.39187905e-04, -8.74062487e-04,
        -4.11872645e-04, -8.99251960e-05,  1.02358192e-02,
        -1.90811912e-02],
       [ 4.83524715e-07,  1.29375820e-03,  6.94043844e-04,
        -4.86023283e-03, -4.92401921e-04,  6.57163948e-03,
        -3.51964404e-03,  5.44925090e-02,  4.42251024e-03,
         5.26040420e-02],
       [ 2.01329562e-05, -6.45630355e-03, -1.16226138e-02,
        -7.79416931e-03,  1.07821562e-02,  2.86087218e-02,
        -2.98724980e-03,  9.14405437e-02, -9.75770889e-02,
        -6.12176169e-01],
       [ 1.63301024e-06, -5.12785214e-05, -3.14815786e-04,
        -9.23592178e-04,  3.70656373e-04,  8.22241229e-03,
         5.95654166e-04,  3.11248179e-03, -9.32226126e-03,
         1.32046343e-02],
       [ 2.34214982e-04, -1.11095480e-04,  3.76695899e-04,
         6.74336055e-04,  2.42771619e-01,  2.20940492e-03,
        -1.10723575e-01, -1.62624414e-03, -7.65313102e-03,
         1.23782315e-02],
       [-1.00552968e-04, -2.54233462e-02, -8.65625312e-03,
         1.51800869e-02, -4.89382817e-02, -7.77939254e-02,
        -4.63969882e-03,  1.78140980e-02,  6.90712432e-01,
        -1.29797367e+00],
       [ 2.24049884e-04,  3.83974657e-04,  2.71729758e-03,
         2.99380055e-03,  1.09298282e-01,  2.28603536e-03,
         3.05975946e-01,  5.49438494e-03,  4.61773843e-02,
         8.02183976e-02],
       [-8.55592163e-06,  1.61252409e-02, -7.88053876e-03,
         5.32535848e-02, -7.03850457e-02, -1.08173380e-01,
        -1.97062371e-02, -5.88501217e-01,  1.52111419e+00,
         1.05219136e+00],
       [-1.11361568e-04, -9.50898116e-03,  3.56737091e-02,
        -3.43493873e-02,  3.28975019e-02,  3.09060558e-02,
         1.27627911e-02,  5.00510549e-01, -6.49358261e-01,
        -3.12351981e-01],
       [ 3.79474142e-06,  1.02960036e-03,  2.55916754e-03,
        -1.47442984e-03,  1.40138619e-03,  5.70525768e-03,
        -1.23781318e-03,  4.13003575e-03, -1.26688619e-02,
         6.38244396e-02],
       [ 6.76517387e-05, -3.78335229e-02,  2.20538999e-03,
        -5.74602006e-02,  5.68802113e-02,  5.07885360e-02,
         1.34128264e-02, -1.39538224e-01, -1.24852854e+00,
        -2.58315112e-01],
       [ 2.56721100e-06,  6.06436939e-03,  1.49331382e-02,
        -4.82933192e-04, -4.11692707e-03,  2.23773452e-02,
         6.39896649e-02, -1.21428745e-01, -5.33785527e-01,
        -4.22509655e-01],
       [ 2.49014914e-01, -2.47326120e-04,  1.93256880e-02,
         2.99288845e-01,  7.85338829e+01, -3.84420097e-02,
        -2.94088701e+01,  2.21181199e-03,  5.72521371e-03,
        -1.62812980e-02],
       [ 2.36342548e-07,  5.45255002e-04,  1.41791164e-04,
         5.48399403e-05,  2.94521670e-03,  5.41985909e-03,
         5.96944667e-03,  7.33481999e-03, -6.52499115e-02,
        -6.37952661e-02],
       [ 1.10161990e-05, -1.79443769e-04,  3.17667171e-04,
         7.95227765e-04,  3.95563487e-03,  5.65594394e-03,
         8.78430120e-04,  4.16905105e-01, -1.15271269e-01,
         1.96183105e-02],
       [-2.03451115e-06, -4.34762308e-04, -7.96714171e-04,
         7.70565380e-04, -1.45161725e-03, -2.15220695e-03,
         1.60609342e-04, -3.40233862e-04, -1.11807258e-03,
        -2.26935824e-02],
       [ 3.57790344e-05,  1.36035276e-03, -3.38130836e-04,
        -3.69448025e-03,  1.28864867e-02,  3.22262145e-02,
         2.01662651e-02,  2.74791793e-02, -4.20028169e-01,
         1.79379838e-02],
       [-1.84410572e-06,  2.33648220e-03, -4.15500962e-03,
         7.18771862e-03, -1.72532950e-03, -7.46013259e-04,
         9.23189092e-04,  3.13223897e-02, -1.09144024e-01,
        -2.59801191e-02],
       [ 1.65685464e-04,  6.34914150e-03, -3.07203834e-03,
        -9.77471385e-03,  4.89383454e-02,  1.38445239e-01,
         6.18427518e-02,  1.28740753e-01, -1.88162456e+00,
        -7.54648243e-02],
       [ 1.65040060e-04,  6.62465579e-03,  3.47727666e-03,
         4.39047323e-03,  1.22372280e-02,  1.67410543e-02,
        -2.89202196e-02,  1.25216535e-01, -1.05834843e-02,
        -3.09892559e-01],
       [ 3.45635372e-06, -3.53993328e-03, -2.04141922e-03,
        -1.63364593e-03, -2.95458276e-03, -9.23052108e-03,
        -3.39180228e-03, -3.04788544e-02,  6.26017047e-02,
         1.19121367e-02],
       [-6.28767562e-05, -5.34943424e-03, -2.15876811e-03,
         1.16489897e-02, -4.48362548e-03, -2.78175734e-02,
        -8.70515398e-03, -2.18600535e-02,  6.87956762e-02,
        -1.04553641e+00],
       [-8.48504596e-07, -8.57175303e-04, -1.20050161e-03,
        -1.32504802e-04,  4.22307359e-05, -2.63724197e-03,
        -1.35970521e-03,  7.43753892e-04,  7.54341097e-03,
        -2.68105229e-02],
       [-8.05072639e-07, -4.67100720e-05, -4.14850069e-04,
         5.76738112e-03, -1.45921365e-04,  8.42416788e-04,
         1.45596364e-04,  3.59991607e-01, -9.72291290e-03,
        -2.10626325e-02],
       [ 2.64144991e-06,  9.53760348e-04,  5.23100523e-04,
        -1.71569409e-03,  1.15190182e-03,  3.73645656e-03,
         2.47572394e-03,  6.70710851e-04, -1.29675611e-02,
         2.96490716e-02],
       [ 1.11096597e-05,  4.96737757e-03,  1.29536779e-03,
        -6.45028644e-04,  4.78993948e-03,  1.30611738e-02,
         3.50836248e-03,  3.68643493e-02, -1.50854097e-01,
        -2.09751685e-02],
       [ 4.81344022e-06,  2.84031729e-03, -3.16758834e-04,
         5.43049417e-05,  6.73061964e-05, -1.37742363e-03,
         1.84439783e-03,  2.52601013e-02, -4.46535426e-02,
        -1.38965453e-02],
       [ 7.47832945e-03,  6.51795129e-03,  1.64308573e-02,
         1.25637826e-01,  3.25865733e+01,  2.86532790e-04,
         7.08444534e+01,  7.38619793e-03, -1.42343131e-02,
        -2.88982756e-02],
       [ 6.23557616e-04, -5.67112176e-03,  2.89629263e-02,
        -1.79496676e-02,  1.16483549e+00,  5.64078759e-01,
         5.53451180e-01, -1.46002908e-01, -7.45136328e-01,
         1.25886000e+00],
       [ 8.61069871e-07, -6.68252553e-04, -1.09494909e-04,
        -6.78180562e-04,  1.24493262e-03,  3.54406601e-04,
         3.40394798e-03, -3.71889311e-05,  3.19244421e-03,
         5.06303247e-03],
       [-1.89206237e-07, -8.24746082e-05, -1.19481680e-04,
         1.48610918e-04, -1.67918979e-04, -2.83055246e-04,
        -4.72054787e-06, -1.29819997e-03,  3.08805704e-03,
        -3.86247014e-03],
       [ 1.09187919e-06, -5.48176235e-04,  4.72045471e-05,
         2.23759511e-04, -6.48902839e-04, -4.76572445e-04,
        -2.44090580e-03, -1.91116942e-03,  2.01185190e-02,
         2.69840500e-02],
       [ 5.54848708e-06,  2.51879682e-04, -2.31918709e-03,
        -3.89329291e-03,  1.26919427e-03,  7.71196736e-03,
         7.20301225e-03, -1.33277978e-02,  6.84462656e-03,
        -3.11711549e-02],
       [ 1.40176296e-06, -1.90086857e-04,  1.52958410e-03,
        -2.04113645e-03,  5.51392252e-03,  5.93292237e-03,
         3.44730322e-03,  5.46870869e-03,  9.26578898e-03,
         7.75404079e-02],
       [-2.01466030e-06, -2.18801256e-04, -8.28197629e-04,
         2.49742418e-02,  2.78897593e-02,  8.05122742e+01,
        -1.91062065e-02, -3.04980551e-01, -1.60657598e-02,
        -1.58859880e-02],
       [-3.40160841e-08,  8.78959369e-05,  1.19949436e-04,
         1.47856857e-05,  7.78244743e-05,  2.05017998e-04,
         6.57728699e-05, -5.35809166e-04, -1.48579553e-03,
        -2.06748794e-03],
       [-1.93437414e-07,  4.43185947e-05,  2.66597182e-05,
         7.42157706e-05,  3.92267849e-05,  7.55971088e-05,
        -1.15161378e-04, -1.23306510e-04, -8.89423408e-04,
        -3.14457169e-04],
       [ 3.44605795e-07, -3.57729545e-06, -6.90532671e-05,
        -8.65221336e-06, -2.71406773e-05,  1.61150603e-03,
         1.01780733e-04, -6.64650894e-04, -3.38176593e-03,
         8.31960605e-03],
       [ 4.94128592e-05, -2.40905549e-03, -3.56386515e-04,
        -1.22635656e-02,  3.40519577e-01,  3.65724017e-02,
         1.05377094e+00, -2.50085749e-02, -1.03974692e+00,
         4.94815029e-01],
       [ 1.78688517e-04,  2.01447341e-03, -2.75929802e-03,
         2.94451067e-03,  1.13194257e-03, -6.33739784e-04,
         7.37104162e-03,  9.50858685e-03, -6.11462496e-02,
        -2.01699001e-02],
       [ 5.98136510e-06,  5.42103711e-04,  1.67449604e-03,
        -1.78335128e-03,  8.97596760e-04,  3.96027662e-03,
         1.70813280e-02, -1.26880737e-03, -7.76775137e-02,
         3.33923803e-02],
       [ 2.60102358e-06,  9.18684941e-04,  1.67527414e-03,
        -1.51271066e-03,  2.15721342e-03,  2.73896129e-03,
        -1.69262564e-03,  1.58473359e-02, -3.20841082e-02,
         4.40758676e-02]])

# Pearson correlation between every pair of rows of decomposed_matrix
# (np.corrcoef treats each row as one variable, i.e. one product here).
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

(76, 76)

X.index[75]

'B00000K135'

# Product to find neighbours for: the entry at position 75 of X's index.
i = X.index[75]

# Look up the row position of that product id.
product_names = X.index.tolist()
product_ID = product_names.index(i)
product_ID

75

# Correlation of the chosen product with every product (one row of the matrix).
correlation_product_ID = correlation_matrix[product_ID, :]
correlation_product_ID.shape

(76,)

correlation_product_ID

array([-0.06077534, -0.66945807, -0.74395009, -0.76411475,  0.25361489,
       -0.03206488, -0.02674622, -0.46812991,  0.34203726, -0.42477089,
        0.50894604, -0.23305399,  0.96679076,  0.74230299,  0.76087043,
        0.99586937, -0.497791  ,  0.91953291,  0.64483058,  0.76841806,
        0.83319006,  0.28461963, -0.61553865,  0.90408135,  0.21303586,
        0.85277249, -0.09136301,  0.55339825,  0.67724018,  0.23664262,
        0.95406914,  0.72680273, -0.27015867, -0.09787857, -0.94811498,
        0.71433134, -0.60801545,  0.90649415,  0.07018666, -0.94533058,
        0.00496   , -0.17478879,  0.33788321,  0.88896022,  0.48294925,
       -0.00897042,  0.0126293 , -0.05168211,  0.4299742 , -0.75363939,
        0.68466366,  0.4999764 ,  0.63073597, -0.57203663, -0.51746143,
       -0.81339383, -0.88719791,  0.20714384,  0.93481353,  0.56821469,
        0.44683368, -0.09660051,  0.69030092,  0.23874399, -0.99832767,
        0.21822018, -0.85777932,  0.72427915, -0.0098365 , -0.29968169,
        0.34002953,  0.89601289,  0.5788912 ,  0.39894377,  0.83075875,
        1.        ])

# Candidates are the products whose Pearson correlation with product i is
# above 0.65. They are ranked by correlation, strongest first, so that the
# slice at the end really is a "top 24"; index order alone is not a ranking.
product_correlations = pd.Series(correlation_product_ID, index=X.index)
ranked = product_correlations[product_correlations > 0.65].sort_values(ascending=False)

# Leave out the product itself (it has already been purchased).
# errors='ignore' avoids a failure when it is not among the candidates,
# e.g. if its correlation came out as NaN.
Recommend = list(ranked.drop(i, errors='ignore').index)

Recommend[0:24] # top 24 recommendations

['7214047977',
 '8862935293',
 '9573212919',
 '9575871979',
 '9888002198',
 '9983891212',
 '9984984354',
 'B000001OM5',
 'B000001ON6',
 'B00000J05A',
 'B00000J08Q',
 'B00000J0D2',
 'B00000J1EP',
 'B00000J1F3',
 'B00000J1U8',
 'B00000J434',
 'B00000JCT8',
 'B00000JDF6',
 'B00000JHWX',
 'B00000JPPI',
 'B00000JYWQ']

Recommend

['7214047977',
 '8862935293',
 '9573212919',
 '9575871979',
 '9888002198',
 '9983891212',
 '9984984354',
 'B000001OM5',
 'B000001ON6',
 'B00000J05A',
 'B00000J08Q',
 'B00000J0D2',
 'B00000J1EP',
 'B00000J1F3',
 'B00000J1U8',
 'B00000J434',
 'B00000JCT8',
 'B00000JDF6',
 'B00000JHWX',
 'B00000JPPI',
 'B00000JYWQ']

	userId	productId	Rating	timestamp
0	AKM1MP6P0OYPR	0132793040	5.0	1365811200
1	A2CX7LUOHB2NDG	0321732944	5.0	1341100800
2	A2NWSAGRHCP8N5	0439886341	1.0	1367193600
3	A2WNBOD3WNDNKT	0439886341	3.0	1374451200
4	A1GI0U4ZRJA8WN	0439886341	1.0	1334707200

	userId	productId	Rating
0	AKM1MP6P0OYPR	0132793040	5.0
1	A2CX7LUOHB2NDG	0321732944	5.0
2	A2NWSAGRHCP8N5	0439886341	1.0
3	A2WNBOD3WNDNKT	0439886341	3.0
4	A1GI0U4ZRJA8WN	0439886341	1.0

userId	A01852072Z7B68UHLI5UG	A0266076X6KPZ6CCHGVS	A0293130VTX2ZXA70JQS	A030530627MK66BD8V4LN	A0571176384K8RBNKGF8O	A0590501PZ7HOWJKBGQ4	A0641581307AKT5MAOU0Q	A076219533YHEV2LJO988	A0821988FXKFYX53V4QG	A099626739FNCRNHIKBCG	...	AZWOPBY75SGAM	AZX0ZDVAFMN78	AZX5LAN9JEAFF	AZX7I110AF0W2	AZXKUK895VGSM	AZXP46IB63PU8	AZYTSU42BZ7TP	AZZGJ2KMWB7R	AZZMV5VT9W7Y8	AZZST8OYL5P4Q
productId
0972683275	0	0	5	4	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1400501466	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1400501520	0	0	0	0	0	0	0	0	0	3	...	0	0	0	0	0	0	0	0	0	0
1400501776	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	1	0
1400532620	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

Embeddings에 대한 이해 -2 \| 이미지 기반 유사도, 텍스트 기반 유사도에 대해, TensorFlow Hub (0)	2020.08.11
Embeddings에 대한 이해 -1 \| 이미지 기반 유사도, 텍스트 기반 유사도에 대해 (2)	2020.08.10
추천시스템 Collaborative Filtering - Model based approach (0)	2020.05.31
추천시스템 Recommender System 정리 (0)	2020.05.26
추천시스템 Collaborative Filtering(CF) Python 기반 [4] (0)	2020.05.08

Amazon Reviews를 이용한 추천시스템 실습 - surprise패키지 사용

Missing values 수¶

Ratings¶

Ratings 분석¶

Popularity Based Recommendation¶

Collaborative Filtering 이용¶

1) Data 가져오기¶

2) Trainset, Testset으로 나누기¶

3) KNN을 이용 trainset에 파라미터 맞추기¶

4) Test accuracy 측정¶

Model-based collaborative filtering system¶

'개발 > Recommender System' 카테고리의 다른 글

댓글

티스토리툴바

	userId	productId	Rating	timestamp
183	A1BKC6B7YHYTVV	0972683275	4.0	1405382400
184	AWVFSIB42LHKJ	0972683275	4.0	1405209600
185	A36MQBNADRH8YY	0972683275	5.0	1405641600
186	A3SRXMPLAEZ6DP	0972683275	4.0	1405987200
187	A20XXTXWF2TCPY	0972683275	5.0	1405123200

	Rating
productId
0972683275	4.470980
1400501466	3.560000
1400501520	4.243902
1400501776	3.884892
1400532620	3.684211

Amazon Reviews를 이용한 추천시스템 실습 - surprise패키지 사용

Missing values 수¶

Ratings¶

Ratings 분석¶

Popularity Based Recommendation¶

Collaborative Filtering 이용¶

1) Data 가져오기¶

2) Trainset, Testset으로 나누기¶

3) KNN을 이용 trainset에 파라미터 맞추기¶

4) Test accuracy 측정¶

Model-based collaborative filtering system¶

'개발 > Recommender System' 카테고리의 다른 글

관련글

댓글

티스토리툴바