21 minute read

영화 평점 분석 실습

import pandas as pd
from pandas import Series, DataFrame
import numpy as np

1. 영화 평점 데이터 적재 및 전처리

# Load the user table
users = pd.read_csv(
    'data/movielens/users.dat',
    sep='::',
    engine='python',
    names=['사용자아이디', '성별', '연령', '직업', '지역'],
)
사용자아이디 성별 연령 직업 지역
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455
# Load the rating table
ratings = pd.read_csv(
    'data/movielens/ratings.dat',
    sep='::',
    engine='python',
    names=['사용자아이디', '영화아이디', '평점', '타임스탬프'],
)
사용자아이디 영화아이디 평점 타임스탬프
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
# Load the movie table (this file is read as latin-1)
movies = pd.read_csv(
    'data/movielens/movies.dat',
    sep='::',
    engine='python',
    names=['영화아이디', '영화제목', '장르'],
    encoding='latin-1',
)
영화아이디 영화제목 장르
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
# Combine the three tables into a single frame.
# The join keys are spelled out: users/ratings share only 사용자아이디 and the
# result shares only 영화아이디 with movies, so this matches the implicit
# merge while staying correct if another column name ever becomes shared.
data = pd.merge(users, ratings, on='사용자아이디')
data = pd.merge(data, movies, on='영화아이디')
사용자아이디 성별 연령 직업 지역 영화아이디 평점 타임스탬프 영화제목 장르
0 1 F 1 10 48067 1193 5 978300760 One Flew Over the Cuckoo's Nest (1975) Drama
1 2 M 56 16 70072 1193 5 978298413 One Flew Over the Cuckoo's Nest (1975) Drama
2 12 M 25 12 32793 1193 4 978220179 One Flew Over the Cuckoo's Nest (1975) Drama
3 15 M 25 7 22903 1193 4 978199279 One Flew Over the Cuckoo's Nest (1975) Drama
4 17 M 50 1 95350 1193 5 978158471 One Flew Over the Cuckoo's Nest (1975) Drama
# Report the number of rows in each table.
# (The review count could equally be taken from ratings.사용자아이디.count().)
for label, table in (('사용자 수:', users), ('리뷰 수:', ratings), ('영화 수:', movies)):
    print(label, len(table))
사용자 수: 6040
리뷰 수: 1000209
영화 수: 3883
# Did every user leave at least one review?
ratings.사용자아이디.nunique() # equals the user count, so every user took part in reviewing
# Are there movies without any review?
ratings.nunique() # 3883 movies vs. 3706 distinct movie ids in ratings, so 177 movies have no review
사용자아이디      6040
영화아이디       3706
평점             5
타임스탬프     458455
dtype: int64

2. 보고 싶은 영화 찾기

영화들의 평점 평균을 구하여, 사람들에게 인정받는 (평점이 높은) 영화 찾기

사용자아이디 성별 연령 직업 지역 영화아이디 평점 타임스탬프 영화제목 장르
0 1 F 1 10 48067 1193 5 978300760 One Flew Over the Cuckoo's Nest (1975) Drama
1 2 M 56 16 70072 1193 5 978298413 One Flew Over the Cuckoo's Nest (1975) Drama
2 12 M 25 12 32793 1193 4 978220179 One Flew Over the Cuckoo's Nest (1975) Drama
3 15 M 25 7 22903 1193 4 978199279 One Flew Over the Cuckoo's Nest (1975) Drama
4 17 M 50 1 95350 1193 5 978158471 One Flew Over the Cuckoo's Nest (1975) Drama
# Average rating per title, sorted so the highest-rated titles come first (top 10)
(
    data
    .pivot_table(values='평점', index='영화제목', aggfunc='mean')
    .sort_values(by='평점', ascending=False)
    .head(10)
)
Ulysses (Ulisse) (1954) 5.0
Lured (1947) 5.0
Follow the Bitch (1998) 5.0
Bittersweet Motel (2000) 5.0
Song of Freedom (1936) 5.0
One Little Indian (1973) 5.0
Smashing Time (1967) 5.0
Schlafes Bruder (Brother of Sleep) (1995) 5.0
Gate of Heavenly Peace, The (1995) 5.0
Baby, The (1973) 5.0
# Same top 10 via nlargest (nsmallest would give the other end of the ranking)
(
    data
    .pivot_table(values='평점', index='영화제목', aggfunc='mean')
    .nlargest(10, '평점')
)
Baby, The (1973) 5.0
Bittersweet Motel (2000) 5.0
Follow the Bitch (1998) 5.0
Gate of Heavenly Peace, The (1995) 5.0
Lured (1947) 5.0
One Little Indian (1973) 5.0
Schlafes Bruder (Brother of Sleep) (1995) 5.0
Smashing Time (1967) 5.0
Song of Freedom (1936) 5.0
Ulysses (Ulisse) (1954) 5.0

평균 평점이 만점인 영화들이 최상위에 위치함. 일반적으로 평점이 만점인 경우는 대부분 평점의 개수가 매우 적은 경우이므로, 이를 확인하기 위해 평점의 개수도 함께 구해본다.

# Titles could repeat, so strictly speaking movies should be keyed by 영화아이디
# Check whether any movie title is duplicated

movies.영화제목.nunique() # matches the movie count found above, so no title is duplicated
영화아이디    3883
영화제목     3883
장르        301
dtype: int64
# (To be safe against duplicate titles, index=['영화아이디','영화제목'] could be
#  used for the top-10 ranking instead of the title alone.)
# Mean rating together with the number of ratings, per title
data.pivot_table(
    values='평점',
    index='영화제목',
    aggfunc=['mean', 'count'],
)
mean count
평점 평점
$1,000,000 Duck (1971) 3.027027 37
'Night Mother (1986) 3.371429 70
'Til There Was You (1997) 2.692308 52
'burbs, The (1989) 2.910891 303
...And Justice for All (1979) 3.713568 199
... ... ...
Zed & Two Noughts, A (1985) 3.413793 29
Zero Effect (1998) 3.750831 301
Zero Kelvin (Kjærlighetens kjøtere) (1995) 3.500000 2
Zeus and Roxanne (1997) 2.521739 23
eXistenZ (1999) 3.256098 410

3706 rows × 2 columns

# Top 10 by mean rating, shown next to the rating count
# (goal: find titles that are both highly rated and rated by many users)
(
    data
    .pivot_table(values='평점', index='영화제목', aggfunc=['mean', 'count'])
    .nlargest(10, ('mean', '평점'))
)
mean count
평점 평점
Baby, The (1973) 5.0 1
Bittersweet Motel (2000) 5.0 1
Follow the Bitch (1998) 5.0 1
Gate of Heavenly Peace, The (1995) 5.0 3
Lured (1947) 5.0 1
One Little Indian (1973) 5.0 1
Schlafes Bruder (Brother of Sleep) (1995) 5.0 1
Smashing Time (1967) 5.0 2
Song of Freedom (1936) 5.0 1
Ulysses (Ulisse) (1954) 5.0 1
# "Movies worth watching": mean rating of at least 4.5 with at least 1000 ratings.
영화평점 = data.pivot_table(index='영화제목', aggfunc=['mean','count'], values='평점')
# Flatten the ('mean', '평점') / ('count', '평점') column pairs into plain names;
# the filter that follows reads 영화평점.평점평균 and 영화평점.평점개수.
영화평점.columns = ['평점평균', '평점개수']
평점평균 평점개수
$1,000,000 Duck (1971) 3.027027 37
'Night Mother (1986) 3.371429 70
'Til There Was You (1997) 2.692308 52
'burbs, The (1989) 2.910891 303
...And Justice for All (1979) 3.713568 199
... ... ...
Zed & Two Noughts, A (1985) 3.413793 29
Zero Effect (1998) 3.750831 301
Zero Kelvin (Kjærlighetens kjøtere) (1995) 3.500000 2
Zeus and Roxanne (1997) 2.521739 23
eXistenZ (1999) 3.256098 410

3706 rows × 2 columns

# Keep titles whose mean rating is at least 4.5 over at least 1000 ratings
영화평점.loc[
    (영화평점['평점평균'] >= 4.5)
    & (영화평점['평점개수'] >= 1000)
]
평점평균 평점개수
Godfather, The (1972) 4.524966 2223
Schindler's List (1993) 4.510417 2304
Shawshank Redemption, The (1994) 4.554558 2227
Usual Suspects, The (1995) 4.517106 1783

[실습 #1] 여자들이 좋아하는 영화 찾기

- 여성 평점이 4.0 이상이고 여성 평점의 개수가 500개 이상인 영화

# Rows rated by female users only
data_f = data.loc[data['성별'] == 'F']
사용자아이디 성별 연령 직업 지역 영화아이디 평점 타임스탬프 영화제목 장르
0 1 F 1 10 48067 1193 5 978300760 One Flew Over the Cuckoo's Nest (1975) Drama
5 18 F 18 3 95825 1193 4 978156168 One Flew Over the Cuckoo's Nest (1975) Drama
7 24 F 25 7 10023 1193 5 978136709 One Flew Over the Cuckoo's Nest (1975) Drama
8 28 F 25 1 14607 1193 3 978125194 One Flew Over the Cuckoo's Nest (1975) Drama
19 59 F 50 1 55413 1193 4 977934292 One Flew Over the Cuckoo's Nest (1975) Drama
... ... ... ... ... ... ... ... ... ... ...
1000199 5334 F 56 13 46140 3382 5 960796159 Song of Freedom (1936) Drama
1000200 5420 F 1 19 14850 1843 3 960156505 Slappy and the Stinkers (1998) Children's|Comedy
1000201 5433 F 35 17 45014 286 3 960240881 Nemesis 2: Nebula (1995) Action|Sci-Fi|Thriller
1000202 5494 F 35 17 94306 3530 4 959816296 Smoking/No Smoking (1993) Comedy
1000207 5851 F 18 20 55410 3607 5 957756608 One Little Indian (1973) Comedy|Drama|Western

246440 rows × 10 columns

# Mean rating and rating count per title, female users only
여성평점 = data_f.pivot_table(index='영화제목', aggfunc=['mean','count'], values='평점')
# Flatten the two-level columns produced by the aggfunc list so the
# attribute-style filter below can address them by name.
여성평점.columns = ['평점평균', '평점개수']
# Titles women rate at 4.0 or higher on average, with at least 500 female ratings
여성평점[(여성평점.평점평균 >= 4.0) & (여성평점.평점개수 >=500)]
평점평균 평점개수
American Beauty (1999) 4.238901 946
Being John Malkovich (1999) 4.159930 569
Braveheart (1995) 4.016484 546
Casablanca (1942) 4.300990 505
E.T. the Extra-Terrestrial (1982) 4.089850 601
Fargo (1996) 4.217656 657
Forrest Gump (1994) 4.045031 644
L.A. Confidential (1997) 4.106007 566
Matrix, The (1999) 4.128405 514
Princess Bride, The (1987) 4.342767 636
Pulp Fiction (1994) 4.071956 542
Raiders of the Lost Ark (1981) 4.332168 572
Saving Private Ryan (1998) 4.114783 575
Schindler's List (1993) 4.562602 615
Shakespeare in Love (1998) 4.181704 798
Shawshank Redemption, The (1994) 4.539075 627
Silence of the Lambs, The (1991) 4.271955 706
Sixth Sense, The (1999) 4.477410 664
Star Wars: Episode IV - A New Hope (1977) 4.302937 647
Star Wars: Episode V - The Empire Strikes Back (1980) 4.106481 648
Toy Story (1995) 4.187817 591
Wizard of Oz, The (1939) 4.355030 507

                                  aggfunc=['mean','count'], values='평점')
mean count
평점 평점
$1,000,000 Duck (1971) 3.375000 16
'Night Mother (1986) 3.388889 36
'Til There Was You (1997) 2.675676 37
'burbs, The (1989) 2.793478 92
...And Justice for All (1979) 3.828571 35
... ... ...
Your Friends and Neighbors (1998) 2.888889 27
Zed & Two Noughts, A (1985) 3.500000 8
Zero Effect (1998) 3.864407 59
Zeus and Roxanne (1997) 2.777778 9
eXistenZ (1999) 3.098592 71

3481 rows × 2 columns

# One pivot covering both genders: columns are (statistic, gender) pairs
ex1 = data.pivot_table(
    values='평점',
    index='영화제목',
    columns='성별',
    aggfunc=['mean', 'count'],
)
mean count
성별 F M F M
$1,000,000 Duck (1971) 3.375000 2.761905 16.0 21.0
'Night Mother (1986) 3.388889 3.352941 36.0 34.0
'Til There Was You (1997) 2.675676 2.733333 37.0 15.0
'burbs, The (1989) 2.793478 2.962085 92.0 211.0
...And Justice for All (1979) 3.828571 3.689024 35.0 164.0
... ... ... ... ...
Zed & Two Noughts, A (1985) 3.500000 3.380952 8.0 21.0
Zero Effect (1998) 3.864407 3.723140 59.0 242.0
Zero Kelvin (Kjærlighetens kjøtere) (1995) NaN 3.500000 NaN 2.0
Zeus and Roxanne (1997) 2.777778 2.357143 9.0 14.0
eXistenZ (1999) 3.098592 3.289086 71.0 339.0

3706 rows × 4 columns

# Female mean of at least 4.0 backed by at least 500 female ratings
ex1.loc[
    (ex1[('mean', 'F')] >= 4.0)
    & (ex1[('count', 'F')] >= 500)
]
mean count
성별 F M F M
American Beauty (1999) 4.238901 4.347301 946.0 2482.0
Being John Malkovich (1999) 4.159930 4.113636 569.0 1672.0
Braveheart (1995) 4.016484 4.297839 546.0 1897.0
Casablanca (1942) 4.300990 4.461340 505.0 1164.0
E.T. the Extra-Terrestrial (1982) 4.089850 3.920264 601.0 1668.0
Fargo (1996) 4.217656 4.267780 657.0 1856.0
Forrest Gump (1994) 4.045031 4.105806 644.0 1550.0
L.A. Confidential (1997) 4.106007 4.256678 566.0 1722.0
Matrix, The (1999) 4.128405 4.362235 514.0 2076.0
Princess Bride, The (1987) 4.342767 4.288942 636.0 1682.0
Pulp Fiction (1994) 4.071956 4.346839 542.0 1629.0
Raiders of the Lost Ark (1981) 4.332168 4.520597 572.0 1942.0
Saving Private Ryan (1998) 4.114783 4.398941 575.0 2078.0
Schindler's List (1993) 4.562602 4.491415 615.0 1689.0
Shakespeare in Love (1998) 4.181704 4.099936 798.0 1571.0
Shawshank Redemption, The (1994) 4.539075 4.560625 627.0 1600.0
Silence of the Lambs, The (1991) 4.271955 4.381944 706.0 1872.0
Sixth Sense, The (1999) 4.477410 4.379944 664.0 1795.0
Star Wars: Episode IV - A New Hope (1977) 4.302937 4.495307 647.0 2344.0
Star Wars: Episode V - The Empire Strikes Back (1980) 4.106481 4.344577 648.0 2342.0
Toy Story (1995) 4.187817 4.130552 591.0 1486.0
Wizard of Oz, The (1939) 4.355030 4.203138 507.0 1211.0

[실습 #2] 실습 #1에서 구한 영화(여성인기영화)의 장르를 분석해 보자.

여성인기영화의 장르 통계 구하기

예를 들어, 여성인기영화 중 Drama 장르의 영화는 10개, Action 영화는 3개, …

# Movies popular with women: female mean >= 4.0 and at least 500 female ratings
여성인기영화 = 여성평점.loc[
    (여성평점['평점평균'] >= 4.0)
    & (여성평점['평점개수'] >= 500)
]
평점평균 평점개수
American Beauty (1999) 4.238901 946
Being John Malkovich (1999) 4.159930 569
Braveheart (1995) 4.016484 546
Casablanca (1942) 4.300990 505
E.T. the Extra-Terrestrial (1982) 4.089850 601
Fargo (1996) 4.217656 657
Forrest Gump (1994) 4.045031 644
L.A. Confidential (1997) 4.106007 566
Matrix, The (1999) 4.128405 514
Princess Bride, The (1987) 4.342767 636
Pulp Fiction (1994) 4.071956 542
Raiders of the Lost Ark (1981) 4.332168 572
Saving Private Ryan (1998) 4.114783 575
Schindler's List (1993) 4.562602 615
Shakespeare in Love (1998) 4.181704 798
Shawshank Redemption, The (1994) 4.539075 627
Silence of the Lambs, The (1991) 4.271955 706
Sixth Sense, The (1999) 4.477410 664
Star Wars: Episode IV - A New Hope (1977) 4.302937 647
Star Wars: Episode V - The Empire Strikes Back (1980) 4.106481 648
Toy Story (1995) 4.187817 591
Wizard of Oz, The (1939) 4.355030 507
Index(['American Beauty (1999)', 'Being John Malkovich (1999)',
       'Braveheart (1995)', 'Casablanca (1942)',
       'E.T. the Extra-Terrestrial (1982)', 'Fargo (1996)',
       'Forrest Gump (1994)', 'L.A. Confidential (1997)', 'Matrix, The (1999)',
       'Princess Bride, The (1987)', 'Pulp Fiction (1994)',
       'Raiders of the Lost Ark (1981)', 'Saving Private Ryan (1998)',
       'Schindler's List (1993)', 'Shakespeare in Love (1998)',
       'Shawshank Redemption, The (1994)', 'Silence of the Lambs, The (1991)',
       'Sixth Sense, The (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       'Toy Story (1995)', 'Wizard of Oz, The (1939)'],
      dtype='object', name='영화제목')
array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       ..., 'Tigerland (2000)', 'Two Family House (2000)',
       'Contender, The (2000)'], dtype=object)
# Collect the popular titles that also exist in the movies table, keeping
# the order of 여성인기영화.index. The loop body was missing, so `lst` (used by
# the isin() filter that follows) was never built.
known_titles = set(movies.영화제목.values)  # built once for fast membership tests
lst = [i for i in 여성인기영화.index if i in known_titles]
['American Beauty (1999)',
 'Being John Malkovich (1999)',
 'Braveheart (1995)',
 'Casablanca (1942)',
 'E.T. the Extra-Terrestrial (1982)',
 'Fargo (1996)',
 'Forrest Gump (1994)',
 'L.A. Confidential (1997)',
 'Matrix, The (1999)',
 'Princess Bride, The (1987)',
 'Pulp Fiction (1994)',
 'Raiders of the Lost Ark (1981)',
 'Saving Private Ryan (1998)',
 "Schindler's List (1993)",
 'Shakespeare in Love (1998)',
 'Shawshank Redemption, The (1994)',
 'Silence of the Lambs, The (1991)',
 'Sixth Sense, The (1999)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Toy Story (1995)',
 'Wizard of Oz, The (1939)']
# Movie rows (id, title, genres) for the titles collected in lst
여성인기영화장르 = movies.loc[movies['영화제목'].isin(lst)]
영화아이디 영화제목 장르
0 1 Toy Story (1995) Animation|Children's|Comedy
108 110 Braveheart (1995) Action|Drama|War
257 260 Star Wars: Episode IV - A New Hope (1977) Action|Adventure|Fantasy|Sci-Fi
293 296 Pulp Fiction (1994) Crime|Drama
315 318 Shawshank Redemption, The (1994) Drama
352 356 Forrest Gump (1994) Comedy|Romance|War
523 527 Schindler's List (1993) Drama|War
589 593 Silence of the Lambs, The (1991) Drama|Thriller
604 608 Fargo (1996) Crime|Drama|Thriller
900 912 Casablanca (1942) Drama|Romance|War
907 919 Wizard of Oz, The (1939) Adventure|Children's|Drama|Musical
1081 1097 E.T. the Extra-Terrestrial (1982) Children's|Drama|Fantasy|Sci-Fi
1178 1196 Star Wars: Episode V - The Empire Strikes Back... Action|Adventure|Drama|Sci-Fi|War
1179 1197 Princess Bride, The (1987) Action|Adventure|Comedy|Romance
1180 1198 Raiders of the Lost Ark (1981) Action|Adventure
1575 1617 L.A. Confidential (1997) Crime|Film-Noir|Mystery|Thriller
1959 2028 Saving Private Ryan (1998) Action|Drama|War
2327 2396 Shakespeare in Love (1998) Comedy|Romance
2502 2571 Matrix, The (1999) Action|Sci-Fi|Thriller
2693 2762 Sixth Sense, The (1999) Thriller
2789 2858 American Beauty (1999) Comedy|Drama
2928 2997 Being John Malkovich (1999) Comedy
# Flatten the '|'-separated genre strings into one list of genre names
k = []
for i in 여성인기영화장르.장르:
    k.extend(i.split('|'))

# Tally how often each genre occurs; keys keep first-seen order.
# dict.get replaces the bare try/except, and `count` is initialised here
# (it was never created before being indexed).
count = {}
for i in k:
    count[i] = count.get(i, 0) + 1
{'Animation': 1, "Children's": 3, 'Comedy': 6, 'Action': 7, 'Drama': 12, 'War': 6, 'Adventure': 5, 'Fantasy': 2, 'Sci-Fi': 4, 'Crime': 3, 'Romance': 4, 'Thriller': 5, 'Musical': 1, 'Film-Noir': 1, 'Mystery': 1}

# Titles of the movies popular with women, taken from the two-gender pivot
ex2 = ex1.loc[
    (ex1[('mean', 'F')] >= 4.0)
    & (ex1[('count', 'F')] >= 500)
].index
Index(['American Beauty (1999)', 'Being John Malkovich (1999)',
       'Braveheart (1995)', 'Casablanca (1942)',
       'E.T. the Extra-Terrestrial (1982)', 'Fargo (1996)',
       'Forrest Gump (1994)', 'L.A. Confidential (1997)', 'Matrix, The (1999)',
       'Princess Bride, The (1987)', 'Pulp Fiction (1994)',
       'Raiders of the Lost Ark (1981)', 'Saving Private Ryan (1998)',
       'Schindler's List (1993)', 'Shakespeare in Love (1998)',
       'Shawshank Redemption, The (1994)', 'Silence of the Lambs, The (1991)',
       'Sixth Sense, The (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       'Toy Story (1995)', 'Wizard of Oz, The (1939)'],
      dtype='object', name='영화제목')
# Movie rows whose title is among the popular titles
ex2_df = movies.loc[movies['영화제목'].isin(ex2)]
영화아이디 영화제목 장르
0 1 Toy Story (1995) Animation|Children's|Comedy
108 110 Braveheart (1995) Action|Drama|War
257 260 Star Wars: Episode IV - A New Hope (1977) Action|Adventure|Fantasy|Sci-Fi
293 296 Pulp Fiction (1994) Crime|Drama
315 318 Shawshank Redemption, The (1994) Drama
352 356 Forrest Gump (1994) Comedy|Romance|War
523 527 Schindler's List (1993) Drama|War
589 593 Silence of the Lambs, The (1991) Drama|Thriller
604 608 Fargo (1996) Crime|Drama|Thriller
900 912 Casablanca (1942) Drama|Romance|War
907 919 Wizard of Oz, The (1939) Adventure|Children's|Drama|Musical
1081 1097 E.T. the Extra-Terrestrial (1982) Children's|Drama|Fantasy|Sci-Fi
1178 1196 Star Wars: Episode V - The Empire Strikes Back... Action|Adventure|Drama|Sci-Fi|War
1179 1197 Princess Bride, The (1987) Action|Adventure|Comedy|Romance
1180 1198 Raiders of the Lost Ark (1981) Action|Adventure
1575 1617 L.A. Confidential (1997) Crime|Film-Noir|Mystery|Thriller
1959 2028 Saving Private Ryan (1998) Action|Drama|War
2327 2396 Shakespeare in Love (1998) Comedy|Romance
2502 2571 Matrix, The (1999) Action|Sci-Fi|Thriller
2693 2762 Sixth Sense, The (1999) Thriller
2789 2858 American Beauty (1999) Comedy|Drama
2928 2997 Being John Malkovich (1999) Comedy
0               [Animation, Children's, Comedy]
108                        [Action, Drama, War]
257        [Action, Adventure, Fantasy, Sci-Fi]
293                              [Crime, Drama]
315                                     [Drama]
352                      [Comedy, Romance, War]
523                                [Drama, War]
589                           [Drama, Thriller]
604                    [Crime, Drama, Thriller]
900                       [Drama, Romance, War]
907     [Adventure, Children's, Drama, Musical]
1081       [Children's, Drama, Fantasy, Sci-Fi]
1178    [Action, Adventure, Drama, Sci-Fi, War]
1179       [Action, Adventure, Comedy, Romance]
1180                        [Action, Adventure]
1575      [Crime, Film-Noir, Mystery, Thriller]
1959                       [Action, Drama, War]
2327                          [Comedy, Romance]
2502                 [Action, Sci-Fi, Thriller]
2693                                 [Thriller]
2789                            [Comedy, Drama]
2928                                   [Comedy]
Name: 장르, dtype: object
# Split the genre string into one column per genre slot (missing slots stay empty)
ex2_df = ex2_df['장르'].str.split(pat='|', expand=True)
0 1 2 3 4
0 Animation Children's Comedy None None
108 Action Drama War None None
257 Action Adventure Fantasy Sci-Fi None
293 Crime Drama None None None
315 Drama None None None None
352 Comedy Romance War None None
523 Drama War None None None
589 Drama Thriller None None None
604 Crime Drama Thriller None None
900 Drama Romance War None None
907 Adventure Children's Drama Musical None
1081 Children's Drama Fantasy Sci-Fi None
1178 Action Adventure Drama Sci-Fi War
1179 Action Adventure Comedy Romance None
1180 Action Adventure None None None
1575 Crime Film-Noir Mystery Thriller None
1959 Action Drama War None None
2327 Comedy Romance None None None
2502 Action Sci-Fi Thriller None None
2693 Thriller None None None None
2789 Comedy Drama None None None
2928 Comedy None None None None
# Genre frequencies for each of the five genre-slot columns
장르1, 장르2, 장르3, 장르4, 장르5 = (
    ex2_df[slot].value_counts() for slot in range(5)
)
# Sum the five tallies; fill_value=0 keeps genres that are absent from a slot
(
    장르1
    .add(장르2, fill_value=0)
    .add(장르3, fill_value=0)
    .add(장르4, fill_value=0)
    .add(장르5, fill_value=0)
)
Action         7.0
Adventure      5.0
Animation      1.0
Children's     3.0
Comedy         6.0
Crime          3.0
Drama         12.0
Fantasy        2.0
Film-Noir      1.0
Musical        1.0
Mystery        1.0
Romance        4.0
Sci-Fi         4.0
Thriller       5.0
War            6.0
dtype: float64
# Accumulate the per-slot genre counts into a single Series.
# The dtype is given explicitly: an empty Series() without one raises the
# DeprecationWarning shown in the output, and float64 keeps the result
# identical to the float totals shown.
sr = Series(dtype='float64')
for col in ex2_df.columns:
    # fill_value=0 keeps genres that have not been seen in earlier columns
    sr = sr.add(ex2_df[col].value_counts(), fill_value=0)
C:\anaconda\envs\test3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  """Entry point for launching an IPython kernel.

Action         7.0
Adventure      5.0
Animation      1.0
Children's     3.0
Comedy         6.0
Crime          3.0
Drama         12.0
Fantasy        2.0
Film-Noir      1.0
Musical        1.0
Mystery        1.0
Romance        4.0
Sci-Fi         4.0
Thriller       5.0
War            6.0
dtype: float64
Drama         12.0
Action         7.0
Comedy         6.0
War            6.0
Adventure      5.0
Thriller       5.0
Romance        4.0
Sci-Fi         4.0
Children's     3.0
Crime          3.0
Fantasy        2.0
Animation      1.0
Film-Noir      1.0
Musical        1.0
Mystery        1.0
dtype: float64

[실습 #3] 남자와 여자의 호불호가 크게 갈리는 영화 10개 찾기

전체 평점의 개수가 500개 이상인 영화만 대상으로 함.

# Rows rated by female users only
data_f = data.loc[data['성별'] == 'F']
사용자아이디 성별 연령 직업 지역 영화아이디 평점 타임스탬프 영화제목 장르
0 1 F 1 10 48067 1193 5 978300760 One Flew Over the Cuckoo's Nest (1975) Drama
5 18 F 18 3 95825 1193 4 978156168 One Flew Over the Cuckoo's Nest (1975) Drama
7 24 F 25 7 10023 1193 5 978136709 One Flew Over the Cuckoo's Nest (1975) Drama
8 28 F 25 1 14607 1193 3 978125194 One Flew Over the Cuckoo's Nest (1975) Drama
19 59 F 50 1 55413 1193 4 977934292 One Flew Over the Cuckoo's Nest (1975) Drama
... ... ... ... ... ... ... ... ... ... ...
1000199 5334 F 56 13 46140 3382 5 960796159 Song of Freedom (1936) Drama
1000200 5420 F 1 19 14850 1843 3 960156505 Slappy and the Stinkers (1998) Children's|Comedy
1000201 5433 F 35 17 45014 286 3 960240881 Nemesis 2: Nebula (1995) Action|Sci-Fi|Thriller
1000202 5494 F 35 17 94306 3530 4 959816296 Smoking/No Smoking (1993) Comedy
1000207 5851 F 18 20 55410 3607 5 957756608 One Little Indian (1973) Comedy|Drama|Western

246440 rows × 10 columns

# Mean rating and rating count per title, female users only
여성평점 = data_f.pivot_table(index='영화제목', aggfunc=['mean','count'], values='평점')
# Name the columns with an _f suffix; the filter below and the later
# female/male comparison address them as 평점평균_f / 평점개수_f.
여성평점.columns = ['평점평균_f', '평점개수_f']
# Keep titles with at least 500 female ratings
여성평점 = 여성평점[여성평점.평점개수_f >=500]
평점평균_f 평점개수_f
American Beauty (1999) 4.238901 946
Babe (1995) 3.953368 579
Back to the Future (1985) 3.932707 639
Being John Malkovich (1999) 4.159930 569
Braveheart (1995) 4.016484 546
Casablanca (1942) 4.300990 505
E.T. the Extra-Terrestrial (1982) 4.089850 601
Fargo (1996) 4.217656 657
Forrest Gump (1994) 4.045031 644
Ghostbusters (1984) 3.833962 530
Groundhog Day (1993) 3.735562 658
Jurassic Park (1993) 3.579407 573
L.A. Confidential (1997) 4.106007 566
Matrix, The (1999) 4.128405 514
Men in Black (1997) 3.817844 538
Princess Bride, The (1987) 4.342767 636
Pulp Fiction (1994) 4.071956 542
Raiders of the Lost Ark (1981) 4.332168 572
Saving Private Ryan (1998) 4.114783 575
Schindler's List (1993) 4.562602 615
Shakespeare in Love (1998) 4.181704 798
Shawshank Redemption, The (1994) 4.539075 627
Silence of the Lambs, The (1991) 4.271955 706
Sixth Sense, The (1999) 4.477410 664
Star Wars: Episode IV - A New Hope (1977) 4.302937 647
Star Wars: Episode V - The Empire Strikes Back (1980) 4.106481 648
Star Wars: Episode VI - Return of the Jedi (1983) 3.865237 653
Toy Story (1995) 4.187817 591
Wizard of Oz, The (1939) 4.355030 507
# Rows rated by male users only
data_m = data.loc[data['성별'] == 'M']
사용자아이디 성별 연령 직업 지역 영화아이디 평점 타임스탬프 영화제목 장르
1 2 M 56 16 70072 1193 5 978298413 One Flew Over the Cuckoo's Nest (1975) Drama
2 12 M 25 12 32793 1193 4 978220179 One Flew Over the Cuckoo's Nest (1975) Drama
3 15 M 25 7 22903 1193 4 978199279 One Flew Over the Cuckoo's Nest (1975) Drama
4 17 M 50 1 95350 1193 5 978158471 One Flew Over the Cuckoo's Nest (1975) Drama
6 19 M 1 10 48073 1193 5 982730936 One Flew Over the Cuckoo's Nest (1975) Drama
... ... ... ... ... ... ... ... ... ... ...
1000203 5556 M 45 6 92103 2198 3 959445515 Modulations (1998) Documentary
1000204 5949 M 18 17 47901 2198 5 958846401 Modulations (1998) Documentary
1000205 5675 M 35 14 30030 2703 3 976029116 Broken Vessels (1998) Drama
1000206 5780 M 18 17 92886 2845 1 958153068 White Boys (1999) Drama
1000208 5938 M 25 1 35401 2909 4 957273353 Five Wives, Three Secretaries and Me (1998) Documentary

753769 rows × 10 columns

# Mean rating and rating count per title, male users only
남성평점 = data_m.pivot_table(index='영화제목', aggfunc=['mean','count'], values='평점')
# Name the columns with an _m suffix; the filter below and the later
# female/male comparison address them as 평점평균_m / 평점개수_m.
남성평점.columns = ['평점평균_m', '평점개수_m']
# Keep titles with at least 500 male ratings
남성평점 = 남성평점[남성평점.평점개수_m >=500]
평점평균_m 평점개수_m
13th Warrior, The (1999) 3.168000 625
2001: A Space Odyssey (1968) 4.129738 1372
Abyss, The (1989) 3.689507 1401
Ace Ventura: Pet Detective (1994) 3.197917 576
Addams Family, The (1991) 3.163498 526
... ... ...
Wrong Trousers, The (1993) 4.478261 644
X-Files: Fight the Future, The (1998) 3.493797 806
X-Men (2000) 3.851702 1234
You've Got Mail (1998) 3.275591 508
Young Frankenstein (1974) 4.239177 924

436 rows × 2 columns

# Place the female and male summaries side by side
여남평점 = pd.concat(
    [여성평점, 남성평점],
    axis=1,
)
# Absolute gap between the female and male mean rating
gap = 여남평점['평점평균_f'] - 여남평점['평점평균_m']
여남평점['평점차이'] = abs(gap)
# The ten titles with the widest gap
여남평점.nlargest(10, '평점차이')
평점평균_f 평점개수_f 평점평균_m 평점개수_m 평점차이
Groundhog Day (1993) 3.735562 658.0 4.041358 1620 0.305796
Saving Private Ryan (1998) 4.114783 575.0 4.398941 2078 0.284159
Braveheart (1995) 4.016484 546.0 4.297839 1897 0.281355
Pulp Fiction (1994) 4.071956 542.0 4.346839 1629 0.274883
Star Wars: Episode V - The Empire Strikes Back (1980) 4.106481 648.0 4.344577 2342 0.238096
Jurassic Park (1993) 3.579407 573.0 3.814197 2099 0.234791
Matrix, The (1999) 4.128405 514.0 4.362235 2076 0.233830
Star Wars: Episode VI - Return of the Jedi (1983) 3.865237 653.0 4.069058 2230 0.203821
Star Wars: Episode IV - A New Hope (1977) 4.302937 647.0 4.495307 2344 0.192371
Raiders of the Lost Ark (1981) 4.332168 572.0 4.520597 1942 0.188429

# Per-title mean and count split by gender
ex3 = data.pivot_table(
    values='평점',
    index='영화제목',
    columns='성별',
    aggfunc=['mean', 'count'],
)
# Keep titles whose female + male rating counts reach 500
ex3 = ex3.loc[ex3[('count', 'F')] + ex3[('count', 'M')] >= 500]
mean count
성별 F M F M
10 Things I Hate About You (1999) 3.646552 3.311966 232.0 468.0
101 Dalmatians (1961) 3.791444 3.500000 187.0 378.0
12 Angry Men (1957) 4.184397 4.328421 141.0 475.0
13th Warrior, The (1999) 3.112000 3.168000 125.0 625.0
20,000 Leagues Under the Sea (1954) 3.670103 3.709205 97.0 478.0
... ... ... ... ...
X-Files: Fight the Future, The (1998) 3.489474 3.493797 190.0 806.0
X-Men (2000) 3.682310 3.851702 277.0 1234.0
You've Got Mail (1998) 3.542424 3.275591 330.0 508.0
Young Frankenstein (1974) 4.289963 4.239177 269.0 924.0
Young Guns (1988) 3.371795 3.425620 78.0 484.0

618 rows × 4 columns

# Same threshold expressed as a row-wise sum over the count columns
ex3.loc[ex3['count'].sum(axis=1) >= 500]
mean count
성별 F M F M
10 Things I Hate About You (1999) 3.646552 3.311966 232.0 468.0
101 Dalmatians (1961) 3.791444 3.500000 187.0 378.0
12 Angry Men (1957) 4.184397 4.328421 141.0 475.0
13th Warrior, The (1999) 3.112000 3.168000 125.0 625.0
20,000 Leagues Under the Sea (1954) 3.670103 3.709205 97.0 478.0
... ... ... ... ...
X-Files: Fight the Future, The (1998) 3.489474 3.493797 190.0 806.0
X-Men (2000) 3.682310 3.851702 277.0 1234.0
You've Got Mail (1998) 3.542424 3.275591 330.0 508.0
Young Frankenstein (1974) 4.289963 4.239177 269.0 924.0
Young Guns (1988) 3.371795 3.425620 78.0 484.0

618 rows × 4 columns

# Ten titles with the largest absolute gap between female and male mean ratings
(
    abs(ex3[('mean', 'F')] - ex3[('mean', 'M')])
    .nlargest(10)
)
Dirty Dancing (1987)                      0.830782
Good, The Bad and The Ugly, The (1966)    0.726351
Dumb & Dumber (1994)                      0.638608
Evil Dead II (Dead By Dawn) (1987)        0.611985
Grease (1978)                             0.608224
Caddyshack (1980)                         0.573602
Animal House (1978)                       0.538286
Exorcist, The (1973)                      0.529605
Rocky Horror Picture Show, The (1975)     0.512885
Big Trouble in Little China (1986)        0.497078
dtype: float64

[실습 #4] 연령대 별로 영화 평점 분석하기

연령대(10대 미만, 10대, 20대, …, 50대 이상) 컬럼을 추가한 후, 영화별·연령대별 평균 평점 구하기

def generate_ages(x):
    """Map a numeric age to its age-bracket label.

    Ages below 10 map to '10대 미만', ages 10-49 map to one label per decade
    ('10대' through '40대'), and every age of 50 or more maps to '50대 이상'.

    Args:
        x: Age as a number.

    Returns:
        The bracket label, or None when the age cannot be ordered against
        the bracket bounds (for example NaN).
    """
    if x < 10:
        return '10대 미만'
    if x < 20:
        return '10대'
    if x < 30:
        return '20대'
    if x < 40:
        return '30대'
    if x < 50:
        return '40대'
    # The last bracket is open-ended ("50s and over"); bounding it with
    # x < 60 made ages of 60 and above fall through and return None.
    if x >= 50:
        return '50대 이상'
    # Only unordered values such as NaN reach this point; leave them unlabeled.
    return None
'50대 이상'
# Add the age-bracket column by labelling every age with generate_ages
data['연령대'] = data['연령'].apply(generate_ages)
사용자아이디 성별 연령 직업 지역 영화아이디 평점 타임스탬프 영화제목 장르 연령대
0 1 F 1 10 48067 1193 5 978300760 One Flew Over the Cuckoo's Nest (1975) Drama 10대 미만
1 2 M 56 16 70072 1193 5 978298413 One Flew Over the Cuckoo's Nest (1975) Drama 50대 이상
2 12 M 25 12 32793 1193 4 978220179 One Flew Over the Cuckoo's Nest (1975) Drama 20대
3 15 M 25 7 22903 1193 4 978199279 One Flew Over the Cuckoo's Nest (1975) Drama 20대
4 17 M 50 1 95350 1193 5 978158471 One Flew Over the Cuckoo's Nest (1975) Drama 50대 이상
... ... ... ... ... ... ... ... ... ... ... ...
1000204 5949 M 18 17 47901 2198 5 958846401 Modulations (1998) Documentary 10대
1000205 5675 M 35 14 30030 2703 3 976029116 Broken Vessels (1998) Drama 30대
1000206 5780 M 18 17 92886 2845 1 958153068 White Boys (1999) Drama 10대
1000207 5851 F 18 20 55410 3607 5 957756608 One Little Indian (1973) Comedy|Drama|Western 10대
1000208 5938 M 25 1 35401 2909 4 957273353 Five Wives, Three Secretaries and Me (1998) Documentary 20대

1000209 rows × 11 columns

# Mean rating per title, with one column per age bracket
연령평점 = data.pivot_table(
    values='평점',
    index='영화제목',
    columns='연령대',
    aggfunc=['mean'],
)
연령대 10대 10대 미만 20대 30대 40대 50대 이상
$1,000,000 Duck (1971) 3.000000 NaN 3.090909 3.133333 2.000000 2.750000
'Night Mother (1986) 4.666667 2.000000 3.423077 2.904762 3.833333 3.750000
'Til There Was You (1997) 2.500000 3.500000 2.666667 2.900000 2.333333 2.600000
'burbs, The (1989) 3.244444 4.500000 2.652174 2.818182 2.545455 3.100000
...And Justice for All (1979) 3.428571 3.000000 3.724138 3.657143 4.100000 3.674419
... ... ... ... ... ... ...
Zed & Two Noughts, A (1985) 3.000000 1.000000 3.375000 3.777778 4.000000 3.000000
Zero Effect (1998) 3.883333 4.125000 3.715278 3.608696 3.764706 3.769231
Zero Kelvin (Kjærlighetens kjøtere) (1995) NaN NaN NaN 3.500000 NaN NaN
Zeus and Roxanne (1997) 2.500000 1.500000 2.833333 3.500000 1.000000 NaN
eXistenZ (1999) 3.289157 3.142857 3.234973 3.364865 3.222222 3.103448

3706 rows × 6 columns