(판다스 시각화) 영화 평점 분석
영화 평점 분석 실습
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
1. 영화 평점 데이터 적재 및 전처리
# Load the user table (columns: user id, gender, age, occupation, region).
# The '::' separator is multi-character, so the python parser engine is requested.
users = pd.read_csv(
    'data/movielens/users.dat',
    sep='::',
    engine='python',
    names=['사용자아이디', '성별', '연령', '직업', '지역'],
)
users.head()
사용자아이디 | 성별 | 연령 | 직업 | 지역 | |
---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 |
1 | 2 | M | 56 | 16 | 70072 |
2 | 3 | M | 25 | 15 | 55117 |
3 | 4 | M | 45 | 7 | 02460 |
4 | 5 | M | 25 | 20 | 55455 |
# Load the rating table (columns: user id, movie id, rating, timestamp).
ratings = pd.read_csv(
    'data/movielens/ratings.dat',
    sep='::',
    engine='python',
    names=['사용자아이디', '영화아이디', '평점', '타임스탬프'],
)
ratings.head()
사용자아이디 | 영화아이디 | 평점 | 타임스탬프 | |
---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 |
1 | 1 | 661 | 3 | 978302109 |
2 | 1 | 914 | 3 | 978301968 |
3 | 1 | 3408 | 4 | 978300275 |
4 | 1 | 2355 | 5 | 978824291 |
# Load the movie table (columns: movie id, title, genres).
# Some titles contain non-ASCII characters, so the file is decoded as latin-1.
movies = pd.read_csv(
    'data/movielens/movies.dat',
    sep='::',
    engine='python',
    names=['영화아이디', '영화제목', '장르'],
    encoding='latin-1',
)
movies.head()
영화아이디 | 영화제목 | 장르 | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
# Combine the three tables into a single frame.
# The join keys are spelled out: without `on=`, merge joins on every column
# name the two frames share, so an extra shared column added later would
# silently change the join.
data = pd.merge(users, ratings, on='사용자아이디')
data = pd.merge(data, movies, on='영화아이디')
data.head()
사용자아이디 | 성별 | 연령 | 직업 | 지역 | 영화아이디 | 평점 | 타임스탬프 | 영화제목 | 장르 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 | 1193 | 5 | 978300760 | One Flew Over the Cuckoo's Nest (1975) | Drama |
1 | 2 | M | 56 | 16 | 70072 | 1193 | 5 | 978298413 | One Flew Over the Cuckoo's Nest (1975) | Drama |
2 | 12 | M | 25 | 12 | 32793 | 1193 | 4 | 978220179 | One Flew Over the Cuckoo's Nest (1975) | Drama |
3 | 15 | M | 25 | 7 | 22903 | 1193 | 4 | 978199279 | One Flew Over the Cuckoo's Nest (1975) | Drama |
4 | 17 | M | 50 | 1 | 95350 | 1193 | 5 | 978158471 | One Flew Over the Cuckoo's Nest (1975) | Drama |
# Report the row count of each source table.
# (ratings.사용자아이디.count() is an alternative way to get the review count.)
for label, table in (('사용자 수:', users), ('리뷰 수:', ratings), ('영화 수:', movies)):
    print(label, len(table))
사용자 수: 6040
리뷰 수: 1000209
영화 수: 3883
# Did every user submit a rating?
# Yes: the number of distinct user ids in ratings equals the number of users.
ratings['사용자아이디'].nunique()
6040
# Are there movies that received no rating at all?
ratings.nunique() # 3883 movies are listed but only 3706 distinct movie ids were rated, so 177 movies have no rating
사용자아이디 6040
영화아이디 3706
평점 5
타임스탬프 458455
dtype: int64
2. 보고 싶은 영화 찾기
영화들의 평점 평균을 구하여, 사람들에게 인정받는 (평점이 높은) 영화 찾기
data.head()
사용자아이디 | 성별 | 연령 | 직업 | 지역 | 영화아이디 | 평점 | 타임스탬프 | 영화제목 | 장르 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 | 1193 | 5 | 978300760 | One Flew Over the Cuckoo's Nest (1975) | Drama |
1 | 2 | M | 56 | 16 | 70072 | 1193 | 5 | 978298413 | One Flew Over the Cuckoo's Nest (1975) | Drama |
2 | 12 | M | 25 | 12 | 32793 | 1193 | 4 | 978220179 | One Flew Over the Cuckoo's Nest (1975) | Drama |
3 | 15 | M | 25 | 7 | 22903 | 1193 | 4 | 978199279 | One Flew Over the Cuckoo's Nest (1975) | Drama |
4 | 17 | M | 50 | 1 | 95350 | 1193 | 5 | 978158471 | One Flew Over the Cuckoo's Nest (1975) | Drama |
# Average the ratings per title and list the 10 highest-rated titles.
(
    data.pivot_table(values='평점', index='영화제목', aggfunc='mean')
    .sort_values(by='평점', ascending=False)
    .head(10)
)
평점 | |
---|---|
영화제목 | |
Ulysses (Ulisse) (1954) | 5.0 |
Lured (1947) | 5.0 |
Follow the Bitch (1998) | 5.0 |
Bittersweet Motel (2000) | 5.0 |
Song of Freedom (1936) | 5.0 |
One Little Indian (1973) | 5.0 |
Smashing Time (1967) | 5.0 |
Schlafes Bruder (Brother of Sleep) (1995) | 5.0 |
Gate of Heavenly Peace, The (1995) | 5.0 |
Baby, The (1973) | 5.0 |
# Same top 10 obtained with nlargest (nsmallest would give the bottom 10).
(
    data.pivot_table(values='평점', index='영화제목', aggfunc='mean')
    .nlargest(10, '평점')
)
평점 | |
---|---|
영화제목 | |
Baby, The (1973) | 5.0 |
Bittersweet Motel (2000) | 5.0 |
Follow the Bitch (1998) | 5.0 |
Gate of Heavenly Peace, The (1995) | 5.0 |
Lured (1947) | 5.0 |
One Little Indian (1973) | 5.0 |
Schlafes Bruder (Brother of Sleep) (1995) | 5.0 |
Smashing Time (1967) | 5.0 |
Song of Freedom (1936) | 5.0 |
Ulysses (Ulisse) (1954) | 5.0 |
평균 평점이 만점인 영화들이 최상위에 위치함. 일반적으로 평점이 만점인 경우는 대부분 평점의 개수가 매우 적은 경우이므로, 이를 확인하기 위해 평점의 개수도 함께 구해본다.
# Titles are not guaranteed to be unique, so movie id would be the strictly correct grouping key.
# Check for duplicate titles: the distinct-title count equals the movie count found earlier, so there are none.
movies['영화제목'].nunique()
3883
movies.nunique()
영화아이디 3883
영화제목 3883
장르 301
dtype: int64
# Disabled variant that groups by movie id together with the title:
#data.pivot_table(index=['영화아이디','영화제목'], aggfunc='mean', values='평점')\
# .nlargest(10, '평점') #nsmallest
# Mean rating and number of ratings for every title.
data.pivot_table(index='영화제목', aggfunc=['mean','count'], values='평점')
mean | count | |
---|---|---|
평점 | 평점 | |
영화제목 | ||
$1,000,000 Duck (1971) | 3.027027 | 37 |
'Night Mother (1986) | 3.371429 | 70 |
'Til There Was You (1997) | 2.692308 | 52 |
'burbs, The (1989) | 2.910891 | 303 |
...And Justice for All (1979) | 3.713568 | 199 |
... | ... | ... |
Zed & Two Noughts, A (1985) | 3.413793 | 29 |
Zero Effect (1998) | 3.750831 | 301 |
Zero Kelvin (Kjærlighetens kjøtere) (1995) | 3.500000 | 2 |
Zeus and Roxanne (1997) | 2.521739 | 23 |
eXistenZ (1999) | 3.256098 | 410 |
3706 rows × 2 columns
# Top 10 by mean rating with the rating count alongside; the aim is to find
# titles that are both highly rated and rated by many users.
(
    data.pivot_table(values='평점', index='영화제목', aggfunc=['mean', 'count'])
    .nlargest(10, ('mean', '평점'))
)
mean | count | |
---|---|---|
평점 | 평점 | |
영화제목 | ||
Baby, The (1973) | 5.0 | 1 |
Bittersweet Motel (2000) | 5.0 | 1 |
Follow the Bitch (1998) | 5.0 | 1 |
Gate of Heavenly Peace, The (1995) | 5.0 | 3 |
Lured (1947) | 5.0 | 1 |
One Little Indian (1973) | 5.0 | 1 |
Schlafes Bruder (Brother of Sleep) (1995) | 5.0 | 1 |
Smashing Time (1967) | 5.0 | 2 |
Song of Freedom (1936) | 5.0 | 1 |
Ulysses (Ulisse) (1954) | 5.0 | 1 |
# "Want to watch" criterion: mean rating of 4.5 or more with at least 1000 ratings.
영화평점 = data.pivot_table(
    values='평점',
    index='영화제목',
    aggfunc=['mean', 'count'],
)
# Replace the two-level (aggregate, value) header with flat column names.
영화평점.columns = ['평점평균', '평점개수']
영화평점
평점평균 | 평점개수 | |
---|---|---|
영화제목 | ||
$1,000,000 Duck (1971) | 3.027027 | 37 |
'Night Mother (1986) | 3.371429 | 70 |
'Til There Was You (1997) | 2.692308 | 52 |
'burbs, The (1989) | 2.910891 | 303 |
...And Justice for All (1979) | 3.713568 | 199 |
... | ... | ... |
Zed & Two Noughts, A (1985) | 3.413793 | 29 |
Zero Effect (1998) | 3.750831 | 301 |
Zero Kelvin (Kjærlighetens kjøtere) (1995) | 3.500000 | 2 |
Zeus and Roxanne (1997) | 2.521739 | 23 |
eXistenZ (1999) | 3.256098 | 410 |
3706 rows × 2 columns
# Keep the titles that satisfy both thresholds.
영화평점.loc[(영화평점['평점평균'] >= 4.5) & (영화평점['평점개수'] >= 1000)]
평점평균 | 평점개수 | |
---|---|---|
영화제목 | ||
Godfather, The (1972) | 4.524966 | 2223 |
Schindler's List (1993) | 4.510417 | 2304 |
Shawshank Redemption, The (1994) | 4.554558 | 2227 |
Usual Suspects, The (1995) | 4.517106 | 1783 |
[실습 #1] 여자들이 좋아하는 영화 찾기
- 여성 평점이 4.0 이상이고 여성 평점의 개수가 500개 이상인 영화
# Ratings submitted by female users only.
data_f = data.loc[data['성별'] == 'F']
data_f
사용자아이디 | 성별 | 연령 | 직업 | 지역 | 영화아이디 | 평점 | 타임스탬프 | 영화제목 | 장르 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 | 1193 | 5 | 978300760 | One Flew Over the Cuckoo's Nest (1975) | Drama |
5 | 18 | F | 18 | 3 | 95825 | 1193 | 4 | 978156168 | One Flew Over the Cuckoo's Nest (1975) | Drama |
7 | 24 | F | 25 | 7 | 10023 | 1193 | 5 | 978136709 | One Flew Over the Cuckoo's Nest (1975) | Drama |
8 | 28 | F | 25 | 1 | 14607 | 1193 | 3 | 978125194 | One Flew Over the Cuckoo's Nest (1975) | Drama |
19 | 59 | F | 50 | 1 | 55413 | 1193 | 4 | 977934292 | One Flew Over the Cuckoo's Nest (1975) | Drama |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1000199 | 5334 | F | 56 | 13 | 46140 | 3382 | 5 | 960796159 | Song of Freedom (1936) | Drama |
1000200 | 5420 | F | 1 | 19 | 14850 | 1843 | 3 | 960156505 | Slappy and the Stinkers (1998) | Children's|Comedy |
1000201 | 5433 | F | 35 | 17 | 45014 | 286 | 3 | 960240881 | Nemesis 2: Nebula (1995) | Action|Sci-Fi|Thriller |
1000202 | 5494 | F | 35 | 17 | 94306 | 3530 | 4 | 959816296 | Smoking/No Smoking (1993) | Comedy |
1000207 | 5851 | F | 18 | 20 | 55410 | 3607 | 5 | 957756608 | One Little Indian (1973) | Comedy|Drama|Western |
246440 rows × 10 columns
# Per-title mean and count of the female ratings, with flat column names.
여성평점 = data_f.pivot_table(
    values='평점',
    index='영화제목',
    aggfunc=['mean', 'count'],
)
여성평점.columns = ['평점평균', '평점개수']
# Titles women rate 4.0 or higher on average, with at least 500 female ratings.
여성평점.loc[(여성평점['평점평균'] >= 4.0) & (여성평점['평점개수'] >= 500)]
평점평균 | 평점개수 | |
---|---|---|
영화제목 | ||
American Beauty (1999) | 4.238901 | 946 |
Being John Malkovich (1999) | 4.159930 | 569 |
Braveheart (1995) | 4.016484 | 546 |
Casablanca (1942) | 4.300990 | 505 |
E.T. the Extra-Terrestrial (1982) | 4.089850 | 601 |
Fargo (1996) | 4.217656 | 657 |
Forrest Gump (1994) | 4.045031 | 644 |
L.A. Confidential (1997) | 4.106007 | 566 |
Matrix, The (1999) | 4.128405 | 514 |
Princess Bride, The (1987) | 4.342767 | 636 |
Pulp Fiction (1994) | 4.071956 | 542 |
Raiders of the Lost Ark (1981) | 4.332168 | 572 |
Saving Private Ryan (1998) | 4.114783 | 575 |
Schindler's List (1993) | 4.562602 | 615 |
Shakespeare in Love (1998) | 4.181704 | 798 |
Shawshank Redemption, The (1994) | 4.539075 | 627 |
Silence of the Lambs, The (1991) | 4.271955 | 706 |
Sixth Sense, The (1999) | 4.477410 | 664 |
Star Wars: Episode IV - A New Hope (1977) | 4.302937 | 647 |
Star Wars: Episode V - The Empire Strikes Back (1980) | 4.106481 | 648 |
Toy Story (1995) | 4.187817 | 591 |
Wizard of Oz, The (1939) | 4.355030 | 507 |
# The same female-only summary written as one expression, without the intermediate frame.
data.loc[data['성별'] == 'F'].pivot_table(
    values='평점',
    index='영화제목',
    aggfunc=['mean', 'count'],
)
mean | count | |
---|---|---|
평점 | 평점 | |
영화제목 | ||
$1,000,000 Duck (1971) | 3.375000 | 16 |
'Night Mother (1986) | 3.388889 | 36 |
'Til There Was You (1997) | 2.675676 | 37 |
'burbs, The (1989) | 2.793478 | 92 |
...And Justice for All (1979) | 3.828571 | 35 |
... | ... | ... |
Your Friends and Neighbors (1998) | 2.888889 | 27 |
Zed & Two Noughts, A (1985) | 3.500000 | 8 |
Zero Effect (1998) | 3.864407 | 59 |
Zeus and Roxanne (1997) | 2.777778 | 9 |
eXistenZ (1999) | 3.098592 | 71 |
3481 rows × 2 columns
# Mean and count per title, split into one column per gender.
ex1 = data.pivot_table(
    values='평점',
    index='영화제목',
    columns='성별',
    aggfunc=['mean', 'count'],
)
ex1
mean | count | |||
---|---|---|---|---|
성별 | F | M | F | M |
영화제목 | ||||
$1,000,000 Duck (1971) | 3.375000 | 2.761905 | 16.0 | 21.0 |
'Night Mother (1986) | 3.388889 | 3.352941 | 36.0 | 34.0 |
'Til There Was You (1997) | 2.675676 | 2.733333 | 37.0 | 15.0 |
'burbs, The (1989) | 2.793478 | 2.962085 | 92.0 | 211.0 |
...And Justice for All (1979) | 3.828571 | 3.689024 | 35.0 | 164.0 |
... | ... | ... | ... | ... |
Zed & Two Noughts, A (1985) | 3.500000 | 3.380952 | 8.0 | 21.0 |
Zero Effect (1998) | 3.864407 | 3.723140 | 59.0 | 242.0 |
Zero Kelvin (Kjærlighetens kjøtere) (1995) | NaN | 3.500000 | NaN | 2.0 |
Zeus and Roxanne (1997) | 2.777778 | 2.357143 | 9.0 | 14.0 |
eXistenZ (1999) | 3.098592 | 3.289086 | 71.0 | 339.0 |
3706 rows × 4 columns
# Apply the female thresholds (mean >= 4.0, count >= 500) to the gender-split table.
ex1.loc[(ex1[('mean', 'F')] >= 4.0) & (ex1[('count', 'F')] >= 500)]
mean | count | |||
---|---|---|---|---|
성별 | F | M | F | M |
영화제목 | ||||
American Beauty (1999) | 4.238901 | 4.347301 | 946.0 | 2482.0 |
Being John Malkovich (1999) | 4.159930 | 4.113636 | 569.0 | 1672.0 |
Braveheart (1995) | 4.016484 | 4.297839 | 546.0 | 1897.0 |
Casablanca (1942) | 4.300990 | 4.461340 | 505.0 | 1164.0 |
E.T. the Extra-Terrestrial (1982) | 4.089850 | 3.920264 | 601.0 | 1668.0 |
Fargo (1996) | 4.217656 | 4.267780 | 657.0 | 1856.0 |
Forrest Gump (1994) | 4.045031 | 4.105806 | 644.0 | 1550.0 |
L.A. Confidential (1997) | 4.106007 | 4.256678 | 566.0 | 1722.0 |
Matrix, The (1999) | 4.128405 | 4.362235 | 514.0 | 2076.0 |
Princess Bride, The (1987) | 4.342767 | 4.288942 | 636.0 | 1682.0 |
Pulp Fiction (1994) | 4.071956 | 4.346839 | 542.0 | 1629.0 |
Raiders of the Lost Ark (1981) | 4.332168 | 4.520597 | 572.0 | 1942.0 |
Saving Private Ryan (1998) | 4.114783 | 4.398941 | 575.0 | 2078.0 |
Schindler's List (1993) | 4.562602 | 4.491415 | 615.0 | 1689.0 |
Shakespeare in Love (1998) | 4.181704 | 4.099936 | 798.0 | 1571.0 |
Shawshank Redemption, The (1994) | 4.539075 | 4.560625 | 627.0 | 1600.0 |
Silence of the Lambs, The (1991) | 4.271955 | 4.381944 | 706.0 | 1872.0 |
Sixth Sense, The (1999) | 4.477410 | 4.379944 | 664.0 | 1795.0 |
Star Wars: Episode IV - A New Hope (1977) | 4.302937 | 4.495307 | 647.0 | 2344.0 |
Star Wars: Episode V - The Empire Strikes Back (1980) | 4.106481 | 4.344577 | 648.0 | 2342.0 |
Toy Story (1995) | 4.187817 | 4.130552 | 591.0 | 1486.0 |
Wizard of Oz, The (1939) | 4.355030 | 4.203138 | 507.0 | 1211.0 |
[실습 #2] 실습 #1에서 구한 영화(여성인기영화)의 장르를 분석해 보자.
여성인기영화의 장르 통계 구하기
예를 들어, 여성인기영화 중 Drama 장르의 영화는 10개, Action 영화는 3개, …
# Titles popular with women: female mean >= 4.0 and at least 500 female ratings.
여성인기영화 = 여성평점.loc[(여성평점['평점평균'] >= 4.0) & (여성평점['평점개수'] >= 500)]
여성인기영화
평점평균 | 평점개수 | |
---|---|---|
영화제목 | ||
American Beauty (1999) | 4.238901 | 946 |
Being John Malkovich (1999) | 4.159930 | 569 |
Braveheart (1995) | 4.016484 | 546 |
Casablanca (1942) | 4.300990 | 505 |
E.T. the Extra-Terrestrial (1982) | 4.089850 | 601 |
Fargo (1996) | 4.217656 | 657 |
Forrest Gump (1994) | 4.045031 | 644 |
L.A. Confidential (1997) | 4.106007 | 566 |
Matrix, The (1999) | 4.128405 | 514 |
Princess Bride, The (1987) | 4.342767 | 636 |
Pulp Fiction (1994) | 4.071956 | 542 |
Raiders of the Lost Ark (1981) | 4.332168 | 572 |
Saving Private Ryan (1998) | 4.114783 | 575 |
Schindler's List (1993) | 4.562602 | 615 |
Shakespeare in Love (1998) | 4.181704 | 798 |
Shawshank Redemption, The (1994) | 4.539075 | 627 |
Silence of the Lambs, The (1991) | 4.271955 | 706 |
Sixth Sense, The (1999) | 4.477410 | 664 |
Star Wars: Episode IV - A New Hope (1977) | 4.302937 | 647 |
Star Wars: Episode V - The Empire Strikes Back (1980) | 4.106481 | 648 |
Toy Story (1995) | 4.187817 | 591 |
Wizard of Oz, The (1939) | 4.355030 | 507 |
여성인기영화.index
Index(['American Beauty (1999)', 'Being John Malkovich (1999)',
'Braveheart (1995)', 'Casablanca (1942)',
'E.T. the Extra-Terrestrial (1982)', 'Fargo (1996)',
'Forrest Gump (1994)', 'L.A. Confidential (1997)', 'Matrix, The (1999)',
'Princess Bride, The (1987)', 'Pulp Fiction (1994)',
'Raiders of the Lost Ark (1981)', 'Saving Private Ryan (1998)',
'Schindler's List (1993)', 'Shakespeare in Love (1998)',
'Shawshank Redemption, The (1994)', 'Silence of the Lambs, The (1991)',
'Sixth Sense, The (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
'Star Wars: Episode V - The Empire Strikes Back (1980)',
'Toy Story (1995)', 'Wizard of Oz, The (1939)'],
dtype='object', name='영화제목')
movies.영화제목.values
array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
..., 'Tigerland (2000)', 'Two Family House (2000)',
'Contender, The (2000)'], dtype=object)
# Collect the popular titles that are present in the movie table, in index order.
# The title set is built once so every membership test is a hash lookup rather
# than a scan over the whole title array on each iteration.
known_titles = set(movies.영화제목)
lst = [title for title in 여성인기영화.index if title in known_titles]
lst
['American Beauty (1999)',
'Being John Malkovich (1999)',
'Braveheart (1995)',
'Casablanca (1942)',
'E.T. the Extra-Terrestrial (1982)',
'Fargo (1996)',
'Forrest Gump (1994)',
'L.A. Confidential (1997)',
'Matrix, The (1999)',
'Princess Bride, The (1987)',
'Pulp Fiction (1994)',
'Raiders of the Lost Ark (1981)',
'Saving Private Ryan (1998)',
"Schindler's List (1993)",
'Shakespeare in Love (1998)',
'Shawshank Redemption, The (1994)',
'Silence of the Lambs, The (1991)',
'Sixth Sense, The (1999)',
'Star Wars: Episode IV - A New Hope (1977)',
'Star Wars: Episode V - The Empire Strikes Back (1980)',
'Toy Story (1995)',
'Wizard of Oz, The (1939)']
# Movie-table rows (id, title, genres) of the titles popular with women.
여성인기영화장르 = movies.loc[movies['영화제목'].isin(lst)]
여성인기영화장르
영화아이디 | 영화제목 | 장르 | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
108 | 110 | Braveheart (1995) | Action|Drama|War |
257 | 260 | Star Wars: Episode IV - A New Hope (1977) | Action|Adventure|Fantasy|Sci-Fi |
293 | 296 | Pulp Fiction (1994) | Crime|Drama |
315 | 318 | Shawshank Redemption, The (1994) | Drama |
352 | 356 | Forrest Gump (1994) | Comedy|Romance|War |
523 | 527 | Schindler's List (1993) | Drama|War |
589 | 593 | Silence of the Lambs, The (1991) | Drama|Thriller |
604 | 608 | Fargo (1996) | Crime|Drama|Thriller |
900 | 912 | Casablanca (1942) | Drama|Romance|War |
907 | 919 | Wizard of Oz, The (1939) | Adventure|Children's|Drama|Musical |
1081 | 1097 | E.T. the Extra-Terrestrial (1982) | Children's|Drama|Fantasy|Sci-Fi |
1178 | 1196 | Star Wars: Episode V - The Empire Strikes Back... | Action|Adventure|Drama|Sci-Fi|War |
1179 | 1197 | Princess Bride, The (1987) | Action|Adventure|Comedy|Romance |
1180 | 1198 | Raiders of the Lost Ark (1981) | Action|Adventure |
1575 | 1617 | L.A. Confidential (1997) | Crime|Film-Noir|Mystery|Thriller |
1959 | 2028 | Saving Private Ryan (1998) | Action|Drama|War |
2327 | 2396 | Shakespeare in Love (1998) | Comedy|Romance |
2502 | 2571 | Matrix, The (1999) | Action|Sci-Fi|Thriller |
2693 | 2762 | Sixth Sense, The (1999) | Thriller |
2789 | 2858 | American Beauty (1999) | Comedy|Drama |
2928 | 2997 | Being John Malkovich (1999) | Comedy |
# Flatten the '|'-separated genre strings into one list of genre labels;
# a movie with several genres contributes one entry per genre.
k = [genre for genres in 여성인기영화장르['장르'] for genre in genres.split('|')]
k
['Animation',
"Children's",
'Comedy',
'Action',
'Drama',
'War',
'Action',
'Adventure',
'Fantasy',
'Sci-Fi',
'Crime',
'Drama',
'Drama',
'Comedy',
'Romance',
'War',
'Drama',
'War',
'Drama',
'Thriller',
'Crime',
'Drama',
'Thriller',
'Drama',
'Romance',
'War',
'Adventure',
"Children's",
'Drama',
'Musical',
"Children's",
'Drama',
'Fantasy',
'Sci-Fi',
'Action',
'Adventure',
'Drama',
'Sci-Fi',
'War',
'Action',
'Adventure',
'Comedy',
'Romance',
'Action',
'Adventure',
'Crime',
'Film-Noir',
'Mystery',
'Thriller',
'Action',
'Drama',
'War',
'Comedy',
'Romance',
'Action',
'Sci-Fi',
'Thriller',
'Thriller',
'Comedy',
'Drama',
'Comedy']
# Tally how many times each genre label occurs in k.
# dict.get supplies 0 for a genre not seen yet; this replaces a bare `except:`
# that would also have swallowed unrelated errors.
count = {}
for genre in k:
    count[genre] = count.get(genre, 0) + 1
print(count)
{'Animation': 1, "Children's": 3, 'Comedy': 6, 'Action': 7, 'Drama': 12, 'War': 6, 'Adventure': 5, 'Fantasy': 2, 'Sci-Fi': 4, 'Crime': 3, 'Romance': 4, 'Thriller': 5, 'Musical': 1, 'Film-Noir': 1, 'Mystery': 1}
# Titles popular with women, taken this time from the gender-split table.
ex2 = ex1.loc[(ex1[('mean', 'F')] >= 4.0) & (ex1[('count', 'F')] >= 500)].index
ex2
Index(['American Beauty (1999)', 'Being John Malkovich (1999)',
'Braveheart (1995)', 'Casablanca (1942)',
'E.T. the Extra-Terrestrial (1982)', 'Fargo (1996)',
'Forrest Gump (1994)', 'L.A. Confidential (1997)', 'Matrix, The (1999)',
'Princess Bride, The (1987)', 'Pulp Fiction (1994)',
'Raiders of the Lost Ark (1981)', 'Saving Private Ryan (1998)',
'Schindler's List (1993)', 'Shakespeare in Love (1998)',
'Shawshank Redemption, The (1994)', 'Silence of the Lambs, The (1991)',
'Sixth Sense, The (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
'Star Wars: Episode V - The Empire Strikes Back (1980)',
'Toy Story (1995)', 'Wizard of Oz, The (1939)'],
dtype='object', name='영화제목')
# Movie-table rows for those titles.
ex2_df = movies.loc[movies['영화제목'].isin(ex2)]
ex2_df
영화아이디 | 영화제목 | 장르 | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
108 | 110 | Braveheart (1995) | Action|Drama|War |
257 | 260 | Star Wars: Episode IV - A New Hope (1977) | Action|Adventure|Fantasy|Sci-Fi |
293 | 296 | Pulp Fiction (1994) | Crime|Drama |
315 | 318 | Shawshank Redemption, The (1994) | Drama |
352 | 356 | Forrest Gump (1994) | Comedy|Romance|War |
523 | 527 | Schindler's List (1993) | Drama|War |
589 | 593 | Silence of the Lambs, The (1991) | Drama|Thriller |
604 | 608 | Fargo (1996) | Crime|Drama|Thriller |
900 | 912 | Casablanca (1942) | Drama|Romance|War |
907 | 919 | Wizard of Oz, The (1939) | Adventure|Children's|Drama|Musical |
1081 | 1097 | E.T. the Extra-Terrestrial (1982) | Children's|Drama|Fantasy|Sci-Fi |
1178 | 1196 | Star Wars: Episode V - The Empire Strikes Back... | Action|Adventure|Drama|Sci-Fi|War |
1179 | 1197 | Princess Bride, The (1987) | Action|Adventure|Comedy|Romance |
1180 | 1198 | Raiders of the Lost Ark (1981) | Action|Adventure |
1575 | 1617 | L.A. Confidential (1997) | Crime|Film-Noir|Mystery|Thriller |
1959 | 2028 | Saving Private Ryan (1998) | Action|Drama|War |
2327 | 2396 | Shakespeare in Love (1998) | Comedy|Romance |
2502 | 2571 | Matrix, The (1999) | Action|Sci-Fi|Thriller |
2693 | 2762 | Sixth Sense, The (1999) | Thriller |
2789 | 2858 | American Beauty (1999) | Comedy|Drama |
2928 | 2997 | Being John Malkovich (1999) | Comedy |
ex2_df.장르.str.split('|')
0 [Animation, Children's, Comedy]
108 [Action, Drama, War]
257 [Action, Adventure, Fantasy, Sci-Fi]
293 [Crime, Drama]
315 [Drama]
352 [Comedy, Romance, War]
523 [Drama, War]
589 [Drama, Thriller]
604 [Crime, Drama, Thriller]
900 [Drama, Romance, War]
907 [Adventure, Children's, Drama, Musical]
1081 [Children's, Drama, Fantasy, Sci-Fi]
1178 [Action, Adventure, Drama, Sci-Fi, War]
1179 [Action, Adventure, Comedy, Romance]
1180 [Action, Adventure]
1575 [Crime, Film-Noir, Mystery, Thriller]
1959 [Action, Drama, War]
2327 [Comedy, Romance]
2502 [Action, Sci-Fi, Thriller]
2693 [Thriller]
2789 [Comedy, Drama]
2928 [Comedy]
Name: 장르, dtype: object
# Split the genre string into one column per genre position.
# ex2_df is rebound to the split result, so it no longer has a '장르' column afterwards.
ex2_df = ex2_df['장르'].str.split('|', expand=True)
ex2_df
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | Animation | Children's | Comedy | None | None |
108 | Action | Drama | War | None | None |
257 | Action | Adventure | Fantasy | Sci-Fi | None |
293 | Crime | Drama | None | None | None |
315 | Drama | None | None | None | None |
352 | Comedy | Romance | War | None | None |
523 | Drama | War | None | None | None |
589 | Drama | Thriller | None | None | None |
604 | Crime | Drama | Thriller | None | None |
900 | Drama | Romance | War | None | None |
907 | Adventure | Children's | Drama | Musical | None |
1081 | Children's | Drama | Fantasy | Sci-Fi | None |
1178 | Action | Adventure | Drama | Sci-Fi | War |
1179 | Action | Adventure | Comedy | Romance | None |
1180 | Action | Adventure | None | None | None |
1575 | Crime | Film-Noir | Mystery | Thriller | None |
1959 | Action | Drama | War | None | None |
2327 | Comedy | Romance | None | None | None |
2502 | Action | Sci-Fi | Thriller | None | None |
2693 | Thriller | None | None | None | None |
2789 | Comedy | Drama | None | None | None |
2928 | Comedy | None | None | None | None |
# Count the genre labels found in each of the five split columns.
장르1 = ex2_df[0].value_counts()
장르2 = ex2_df[1].value_counts()
장르3 = ex2_df[2].value_counts()
장르4 = ex2_df[3].value_counts()
장르5 = ex2_df[4].value_counts()
# The totals are built with add(fill_value=0) instead of 장르1+장르2+장르3+장르4+장르5:
# fill_value=0 substitutes 0 for a genre that is missing from one of the operands.
장르1.add(장르2, fill_value=0).add(장르3, fill_value=0).add(장르4, fill_value=0).add(장르5, fill_value=0)
Action 7.0
Adventure 5.0
Animation 1.0
Children's 3.0
Comedy 6.0
Crime 3.0
Drama 12.0
Fantasy 2.0
Film-Noir 1.0
Musical 1.0
Mystery 1.0
Romance 4.0
Sci-Fi 4.0
Thriller 5.0
War 6.0
dtype: float64
# Same genre tally, looping over however many genre columns the split produced.
# The accumulator is given an explicit dtype: an empty Series created without one
# raises a DeprecationWarning because its default dtype is changing to 'object'.
sr = Series(dtype='float64')
for col in ex2_df.columns:
    sr = sr.add(ex2_df[col].value_counts(), fill_value=0)
sr
C:\anaconda\envs\test3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
"""Entry point for launching an IPython kernel.
Action 7.0
Adventure 5.0
Animation 1.0
Children's 3.0
Comedy 6.0
Crime 3.0
Drama 12.0
Fantasy 2.0
Film-Noir 1.0
Musical 1.0
Mystery 1.0
Romance 4.0
Sci-Fi 4.0
Thriller 5.0
War 6.0
dtype: float64
sr.sort_values(ascending=False)
Drama 12.0
Action 7.0
Comedy 6.0
War 6.0
Adventure 5.0
Thriller 5.0
Romance 4.0
Sci-Fi 4.0
Children's 3.0
Crime 3.0
Fantasy 2.0
Animation 1.0
Film-Noir 1.0
Musical 1.0
Mystery 1.0
dtype: float64
[실습 #3] 남자와 여자의 호불호가 크게 갈리는 영화 10개 찾기
전체 평점의 개수가 500개 이상인 영화만 대상으로 함.
# Ratings submitted by female users only.
data_f = data.loc[data['성별'] == 'F']
data_f
사용자아이디 | 성별 | 연령 | 직업 | 지역 | 영화아이디 | 평점 | 타임스탬프 | 영화제목 | 장르 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 | 1193 | 5 | 978300760 | One Flew Over the Cuckoo's Nest (1975) | Drama |
5 | 18 | F | 18 | 3 | 95825 | 1193 | 4 | 978156168 | One Flew Over the Cuckoo's Nest (1975) | Drama |
7 | 24 | F | 25 | 7 | 10023 | 1193 | 5 | 978136709 | One Flew Over the Cuckoo's Nest (1975) | Drama |
8 | 28 | F | 25 | 1 | 14607 | 1193 | 3 | 978125194 | One Flew Over the Cuckoo's Nest (1975) | Drama |
19 | 59 | F | 50 | 1 | 55413 | 1193 | 4 | 977934292 | One Flew Over the Cuckoo's Nest (1975) | Drama |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1000199 | 5334 | F | 56 | 13 | 46140 | 3382 | 5 | 960796159 | Song of Freedom (1936) | Drama |
1000200 | 5420 | F | 1 | 19 | 14850 | 1843 | 3 | 960156505 | Slappy and the Stinkers (1998) | Children's|Comedy |
1000201 | 5433 | F | 35 | 17 | 45014 | 286 | 3 | 960240881 | Nemesis 2: Nebula (1995) | Action|Sci-Fi|Thriller |
1000202 | 5494 | F | 35 | 17 | 94306 | 3530 | 4 | 959816296 | Smoking/No Smoking (1993) | Comedy |
1000207 | 5851 | F | 18 | 20 | 55410 | 3607 | 5 | 957756608 | One Little Indian (1973) | Comedy|Drama|Western |
246440 rows × 10 columns
# Per-title mean and count of the female ratings, with gender-suffixed column names.
여성평점 = data_f.pivot_table(
    values='평점',
    index='영화제목',
    aggfunc=['mean', 'count'],
)
여성평점.columns = ['평점평균_f', '평점개수_f']
# Keep only the titles with at least 500 female ratings.
여성평점 = 여성평점.loc[여성평점['평점개수_f'] >= 500]
여성평점
평점평균_f | 평점개수_f | |
---|---|---|
영화제목 | ||
American Beauty (1999) | 4.238901 | 946 |
Babe (1995) | 3.953368 | 579 |
Back to the Future (1985) | 3.932707 | 639 |
Being John Malkovich (1999) | 4.159930 | 569 |
Braveheart (1995) | 4.016484 | 546 |
Casablanca (1942) | 4.300990 | 505 |
E.T. the Extra-Terrestrial (1982) | 4.089850 | 601 |
Fargo (1996) | 4.217656 | 657 |
Forrest Gump (1994) | 4.045031 | 644 |
Ghostbusters (1984) | 3.833962 | 530 |
Groundhog Day (1993) | 3.735562 | 658 |
Jurassic Park (1993) | 3.579407 | 573 |
L.A. Confidential (1997) | 4.106007 | 566 |
Matrix, The (1999) | 4.128405 | 514 |
Men in Black (1997) | 3.817844 | 538 |
Princess Bride, The (1987) | 4.342767 | 636 |
Pulp Fiction (1994) | 4.071956 | 542 |
Raiders of the Lost Ark (1981) | 4.332168 | 572 |
Saving Private Ryan (1998) | 4.114783 | 575 |
Schindler's List (1993) | 4.562602 | 615 |
Shakespeare in Love (1998) | 4.181704 | 798 |
Shawshank Redemption, The (1994) | 4.539075 | 627 |
Silence of the Lambs, The (1991) | 4.271955 | 706 |
Sixth Sense, The (1999) | 4.477410 | 664 |
Star Wars: Episode IV - A New Hope (1977) | 4.302937 | 647 |
Star Wars: Episode V - The Empire Strikes Back (1980) | 4.106481 | 648 |
Star Wars: Episode VI - Return of the Jedi (1983) | 3.865237 | 653 |
Toy Story (1995) | 4.187817 | 591 |
Wizard of Oz, The (1939) | 4.355030 | 507 |
# Ratings submitted by male users only.
data_m = data.loc[data['성별'] == 'M']
data_m
사용자아이디 | 성별 | 연령 | 직업 | 지역 | 영화아이디 | 평점 | 타임스탬프 | 영화제목 | 장르 | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | M | 56 | 16 | 70072 | 1193 | 5 | 978298413 | One Flew Over the Cuckoo's Nest (1975) | Drama |
2 | 12 | M | 25 | 12 | 32793 | 1193 | 4 | 978220179 | One Flew Over the Cuckoo's Nest (1975) | Drama |
3 | 15 | M | 25 | 7 | 22903 | 1193 | 4 | 978199279 | One Flew Over the Cuckoo's Nest (1975) | Drama |
4 | 17 | M | 50 | 1 | 95350 | 1193 | 5 | 978158471 | One Flew Over the Cuckoo's Nest (1975) | Drama |
6 | 19 | M | 1 | 10 | 48073 | 1193 | 5 | 982730936 | One Flew Over the Cuckoo's Nest (1975) | Drama |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1000203 | 5556 | M | 45 | 6 | 92103 | 2198 | 3 | 959445515 | Modulations (1998) | Documentary |
1000204 | 5949 | M | 18 | 17 | 47901 | 2198 | 5 | 958846401 | Modulations (1998) | Documentary |
1000205 | 5675 | M | 35 | 14 | 30030 | 2703 | 3 | 976029116 | Broken Vessels (1998) | Drama |
1000206 | 5780 | M | 18 | 17 | 92886 | 2845 | 1 | 958153068 | White Boys (1999) | Drama |
1000208 | 5938 | M | 25 | 1 | 35401 | 2909 | 4 | 957273353 | Five Wives, Three Secretaries and Me (1998) | Documentary |
753769 rows × 10 columns
# Per-title mean and count of the male ratings, with gender-suffixed column names.
남성평점 = data_m.pivot_table(
    values='평점',
    index='영화제목',
    aggfunc=['mean', 'count'],
)
남성평점.columns = ['평점평균_m', '평점개수_m']
# Keep only the titles with at least 500 male ratings.
남성평점 = 남성평점.loc[남성평점['평점개수_m'] >= 500]
남성평점
평점평균_m | 평점개수_m | |
---|---|---|
영화제목 | ||
13th Warrior, The (1999) | 3.168000 | 625 |
2001: A Space Odyssey (1968) | 4.129738 | 1372 |
Abyss, The (1989) | 3.689507 | 1401 |
Ace Ventura: Pet Detective (1994) | 3.197917 | 576 |
Addams Family, The (1991) | 3.163498 | 526 |
... | ... | ... |
Wrong Trousers, The (1993) | 4.478261 | 644 |
X-Files: Fight the Future, The (1998) | 3.493797 | 806 |
X-Men (2000) | 3.851702 | 1234 |
You've Got Mail (1998) | 3.275591 | 508 |
Young Frankenstein (1974) | 4.239177 | 924 |
436 rows × 2 columns
# Place the female and male summaries side by side, aligned on the title index.
여남평점 = pd.concat([여성평점, 남성평점],axis=1)
# Absolute gap between the female and the male mean rating.
# NOTE(review): both inputs were already cut to >= 500 ratings per gender, so a title
# absent from either side has NaN here. The exercise asks for >= 500 ratings in total,
# which this per-gender filter does not implement.
여남평점['평점차이'] = abs(여남평점['평점평균_f']-여남평점['평점평균_m'])
# Ten titles with the largest gap.
여남평점.nlargest(10, ('평점차이'))
평점평균_f | 평점개수_f | 평점평균_m | 평점개수_m | 평점차이 | |
---|---|---|---|---|---|
영화제목 | |||||
Groundhog Day (1993) | 3.735562 | 658.0 | 4.041358 | 1620 | 0.305796 |
Saving Private Ryan (1998) | 4.114783 | 575.0 | 4.398941 | 2078 | 0.284159 |
Braveheart (1995) | 4.016484 | 546.0 | 4.297839 | 1897 | 0.281355 |
Pulp Fiction (1994) | 4.071956 | 542.0 | 4.346839 | 1629 | 0.274883 |
Star Wars: Episode V - The Empire Strikes Back (1980) | 4.106481 | 648.0 | 4.344577 | 2342 | 0.238096 |
Jurassic Park (1993) | 3.579407 | 573.0 | 3.814197 | 2099 | 0.234791 |
Matrix, The (1999) | 4.128405 | 514.0 | 4.362235 | 2076 | 0.233830 |
Star Wars: Episode VI - Return of the Jedi (1983) | 3.865237 | 653.0 | 4.069058 | 2230 | 0.203821 |
Star Wars: Episode IV - A New Hope (1977) | 4.302937 | 647.0 | 4.495307 | 2344 | 0.192371 |
Raiders of the Lost Ark (1981) | 4.332168 | 572.0 | 4.520597 | 1942 | 0.188429 |
# Mean and count per title, one column per gender.
ex3 = data.pivot_table(
    values='평점',
    index='영화제목',
    columns='성별',
    aggfunc=['mean', 'count'],
)
# Keep the titles whose female and male counts add up to at least 500.
ex3 = ex3.loc[ex3[('count', 'F')] + ex3[('count', 'M')] >= 500]
ex3
mean | count | |||
---|---|---|---|---|
성별 | F | M | F | M |
영화제목 | ||||
10 Things I Hate About You (1999) | 3.646552 | 3.311966 | 232.0 | 468.0 |
101 Dalmatians (1961) | 3.791444 | 3.500000 | 187.0 | 378.0 |
12 Angry Men (1957) | 4.184397 | 4.328421 | 141.0 | 475.0 |
13th Warrior, The (1999) | 3.112000 | 3.168000 | 125.0 | 625.0 |
20,000 Leagues Under the Sea (1954) | 3.670103 | 3.709205 | 97.0 | 478.0 |
... | ... | ... | ... | ... |
X-Files: Fight the Future, The (1998) | 3.489474 | 3.493797 | 190.0 | 806.0 |
X-Men (2000) | 3.682310 | 3.851702 | 277.0 | 1234.0 |
You've Got Mail (1998) | 3.542424 | 3.275591 | 330.0 | 508.0 |
Young Frankenstein (1974) | 4.289963 | 4.239177 | 269.0 | 924.0 |
Young Guns (1988) | 3.371795 | 3.425620 | 78.0 | 484.0 |
618 rows × 4 columns
# Alternative form of the total-count filter: sum the count columns across each row.
ex3.loc[ex3['count'].sum(axis=1) >= 500]
mean | count | |||
---|---|---|---|---|
성별 | F | M | F | M |
영화제목 | ||||
10 Things I Hate About You (1999) | 3.646552 | 3.311966 | 232.0 | 468.0 |
101 Dalmatians (1961) | 3.791444 | 3.500000 | 187.0 | 378.0 |
12 Angry Men (1957) | 4.184397 | 4.328421 | 141.0 | 475.0 |
13th Warrior, The (1999) | 3.112000 | 3.168000 | 125.0 | 625.0 |
20,000 Leagues Under the Sea (1954) | 3.670103 | 3.709205 | 97.0 | 478.0 |
... | ... | ... | ... | ... |
X-Files: Fight the Future, The (1998) | 3.489474 | 3.493797 | 190.0 | 806.0 |
X-Men (2000) | 3.682310 | 3.851702 | 277.0 | 1234.0 |
You've Got Mail (1998) | 3.542424 | 3.275591 | 330.0 | 508.0 |
Young Frankenstein (1974) | 4.289963 | 4.239177 | 269.0 | 924.0 |
Young Guns (1988) | 3.371795 | 3.425620 | 78.0 | 484.0 |
618 rows × 4 columns
# Ten titles with the largest absolute gap between female and male mean ratings.
(ex3[('mean', 'F')] - ex3[('mean', 'M')]).abs().nlargest(10)
영화제목
Dirty Dancing (1987) 0.830782
Good, The Bad and The Ugly, The (1966) 0.726351
Dumb & Dumber (1994) 0.638608
Evil Dead II (Dead By Dawn) (1987) 0.611985
Grease (1978) 0.608224
Caddyshack (1980) 0.573602
Animal House (1978) 0.538286
Exorcist, The (1973) 0.529605
Rocky Horror Picture Show, The (1975) 0.512885
Big Trouble in Little China (1986) 0.497078
dtype: float64
[실습 #4] 연령대 별로 영화 평점 분석하기
연령대(10대 미만, 10대, 20대, …, 50대 이상) 컬럼을 추가한 후, 영화별·연령대별 평균 평점 구하기
def generate_ages(x):
    """Map a numeric age to its age-band label.

    Args:
        x: Age as a number.

    Returns:
        '10대 미만' when x < 10; '10대', '20대', '30대' or '40대' for the
        following decades; '50대 이상' for every age of 50 or more.
        None when x compares false against every bound (for example NaN).
    """
    if x < 10:
        return '10대 미만'
    if x < 20:
        return '10대'
    if x < 30:
        return '20대'
    if x < 40:
        return '30대'
    if x < 50:
        return '40대'
    if x >= 50:
        # The label means "50s and above", so the band has no upper bound.
        # The earlier `x < 60` test let ages of 60 and over fall through to None.
        return '50대 이상'
    return None
generate_ages(56)
'50대 이상'
# Add an age-band column derived from each row's age.
data['연령대'] = data['연령'].apply(generate_ages)
data
사용자아이디 | 성별 | 연령 | 직업 | 지역 | 영화아이디 | 평점 | 타임스탬프 | 영화제목 | 장르 | 연령대 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 | 1193 | 5 | 978300760 | One Flew Over the Cuckoo's Nest (1975) | Drama | 10대 미만 |
1 | 2 | M | 56 | 16 | 70072 | 1193 | 5 | 978298413 | One Flew Over the Cuckoo's Nest (1975) | Drama | 50대 이상 |
2 | 12 | M | 25 | 12 | 32793 | 1193 | 4 | 978220179 | One Flew Over the Cuckoo's Nest (1975) | Drama | 20대 |
3 | 15 | M | 25 | 7 | 22903 | 1193 | 4 | 978199279 | One Flew Over the Cuckoo's Nest (1975) | Drama | 20대 |
4 | 17 | M | 50 | 1 | 95350 | 1193 | 5 | 978158471 | One Flew Over the Cuckoo's Nest (1975) | Drama | 50대 이상 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1000204 | 5949 | M | 18 | 17 | 47901 | 2198 | 5 | 958846401 | Modulations (1998) | Documentary | 10대 |
1000205 | 5675 | M | 35 | 14 | 30030 | 2703 | 3 | 976029116 | Broken Vessels (1998) | Drama | 30대 |
1000206 | 5780 | M | 18 | 17 | 92886 | 2845 | 1 | 958153068 | White Boys (1999) | Drama | 10대 |
1000207 | 5851 | F | 18 | 20 | 55410 | 3607 | 5 | 957756608 | One Little Indian (1973) | Comedy|Drama|Western | 10대 |
1000208 | 5938 | M | 25 | 1 | 35401 | 2909 | 4 | 957273353 | Five Wives, Three Secretaries and Me (1998) | Documentary | 20대 |
1000209 rows × 11 columns
# Mean rating per title, one column per age band.
연령평점 = data.pivot_table(
    values='평점',
    index='영화제목',
    columns='연령대',
    aggfunc=['mean'],
)
연령평점
mean | ||||||
---|---|---|---|---|---|---|
연령대 | 10대 | 10대 미만 | 20대 | 30대 | 40대 | 50대 이상 |
영화제목 | ||||||
$1,000,000 Duck (1971) | 3.000000 | NaN | 3.090909 | 3.133333 | 2.000000 | 2.750000 |
'Night Mother (1986) | 4.666667 | 2.000000 | 3.423077 | 2.904762 | 3.833333 | 3.750000 |
'Til There Was You (1997) | 2.500000 | 3.500000 | 2.666667 | 2.900000 | 2.333333 | 2.600000 |
'burbs, The (1989) | 3.244444 | 4.500000 | 2.652174 | 2.818182 | 2.545455 | 3.100000 |
...And Justice for All (1979) | 3.428571 | 3.000000 | 3.724138 | 3.657143 | 4.100000 | 3.674419 |
... | ... | ... | ... | ... | ... | ... |
Zed & Two Noughts, A (1985) | 3.000000 | 1.000000 | 3.375000 | 3.777778 | 4.000000 | 3.000000 |
Zero Effect (1998) | 3.883333 | 4.125000 | 3.715278 | 3.608696 | 3.764706 | 3.769231 |
Zero Kelvin (Kjærlighetens kjøtere) (1995) | NaN | NaN | NaN | 3.500000 | NaN | NaN |
Zeus and Roxanne (1997) | 2.500000 | 1.500000 | 2.833333 | 3.500000 | 1.000000 | NaN |
eXistenZ (1999) | 3.289157 | 3.142857 | 3.234973 | 3.364865 | 3.222222 | 3.103448 |
3706 rows × 6 columns