During the National Day Golden Week, the film market absolutely belonged to "The Battle at Lake Changjin": in just a few days its box office broke 3.6 billion yuan, rapidly closing in on "Wolf Warrior 2". Word of mouth is also strong, with a Maoyan score as high as 9.5, a genuine double harvest of box office and reputation.
In this post, we crawl the Maoyan reviews and analyze visually why the movie is so popular, complete with a simple box office prediction. Don't miss it!
Data acquisition
As usual, we crawl the Maoyan comments by constructing the API request directly
url = "https://m.maoyan.com/mmdb/comments/movie/257706.json?v=yes&offset=30"
payload={}
headers = {
'Cookie': '_lxsdk_cuid=17c188b300d13-0ecb2e1c54bec6-a7d173c-100200-17c188b300ec8; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1633622378; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __mta = 87266087.1633622378325.1633622378325.1633622378325.1; uuid_n_v=v1; iuuid=ECBA18D0278711EC8B0DFD12EB2962D2C4A641A554EF466B9362A58679FDD6CF; webp=true; ci=55%2C%E5%8D%97%E4%BA%AC; ci=55%2C%E5%8D%97%E4%BA%AC; ci=55%2C%E5%8D%97%E4%BA%AC; featrues=[object Object]; _lxsdk=92E6A4E0278711ECAE4571A477FD49B513FE367C52044EB5A6974451969DD28A; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1633622806'.'Host': 'm.maoyan.com'.'User-Agent': 'the Mozilla / 5.0 (Windows NT 10.0; Win64; X64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}
response = requests.request("GET", url, headers=headers, data=payload)
print(response.json())
With a few lines of code, we get the following result
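The screenshot of the returned data is not reproduced here; based on the fields we parse below, the response JSON is shaped roughly like this (an inferred sketch, not the exact payload):

# inferred shape of the response, based on the fields parsed later
# {
#     "cmts":  [{"id": ..., "approve": ..., "reply": ..., "startTime": ...,
#                "sureViewed": ..., "nickName": ..., "gender": ..., "cityName": ...,
#                "userLevel": ..., "userId": ..., "score": ..., "content": ...}, ...],
#     "hcmts": [...]  # "hot" comments with the same structure
# }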
Once we have the data, we can parse the returned JSON and write a function to save it locally
import os
import csv
import pandas as pd

def save_data_pd(data_name, list_info):
    if not os.path.exists(data_name + r'_data.csv'):
        # header
        name = ["comment_id", "approve", "reply", "comment_time", "sureViewed", "nickName",
                "gender", "cityName", "userLevel", "user_id", "score", "content"]
        # create a DataFrame object
        file_test = pd.DataFrame(columns=name, data=list_info)
        # write the data
        file_test.to_csv(data_name + r'_data.csv', encoding='utf-8', index=False)
    else:
        with open(data_name + r'_data.csv', 'a+', newline='', encoding='utf-8') as file_test:
            # append to the existing file
            writer = csv.writer(file_test)
            writer.writerows(list_info)
Saving through Pandas from the start spares us a lot of data processing later
Next, write a function that parses the JSON data
def get_data(json_comment):
    list_info = []
    for data in json_comment:
        approve = data["approve"]
        comment_id = data["id"]
        cityName = data["cityName"]
        content = data["content"]
        reply = data["reply"]
        # gender: 1 male, 2 female, 0 unknown
        if "gender" in data:
            gender = data["gender"]
        else:
            gender = 0
        nickName = data["nickName"]
        userLevel = data["userLevel"]
        score = data["score"]
        comment_time = data["startTime"]
        sureViewed = data["sureViewed"]
        user_id = data["userId"]
        list_one = [comment_id, approve, reply, comment_time, sureViewed, nickName, gender,
                    cityName, userLevel, user_id, score, content]
        list_info.append(list_one)
    save_data_pd("maoyan", list_info)
We extract the key fields: the user's nickname, comment time, city, and so on
Finally, integrate the above code and construct the url to crawl
import time

def fire():
    tmp = "https://m.maoyan.com/mmdb/comments/movie/257706.json?v=yes&offset="
    payload = {}
    headers = {
        'Cookie': '_lxsdk_cuid=17c188b300d13-0ecb2e1c54bec6-a7d173c-100200-17c188b300ec8; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1633622378; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __mta = 87266087.1633622378325.1633622378325.1633622378325.1; uuid_n_v=v1; iuuid=ECBA18D0278711EC8B0DFD12EB2962D2C4A641A554EF466B9362A58679FDD6CF; webp=true; ci=55%2C%E5%8D%97%E4%BA%AC; ci=55%2C%E5%8D%97%E4%BA%AC; ci=55%2C%E5%8D%97%E4%BA%AC; featrues=[object Object]; _lxsdk=92E6A4E0278711ECAE4571A477FD49B513FE367C52044EB5A6974451969DD28A; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1633622806',
        'Host': 'm.maoyan.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
    }
    for i in range(0, 3000, 15):
        url = tmp + str(i)
        print(url)
        response = requests.request("GET", url, headers=headers, data=payload)
        comment = response.json()
        if not comment.get("hcmts"):
            break
        hcmts = comment['hcmts']
        get_data(hcmts)
        cmts = comment['cmts']
        get_data(cmts)
        time.sleep(10)
The crawl process is as follows
The local data is as follows
Now we can perform the relevant visual analysis
Visual analysis
1 Data Cleaning
We first load the saved data and remove duplicate rows by comment_id

df = pd.read_csv('maoyan_data.csv')  # the file written by save_data_pd("maoyan", ...)
df_new = df.drop_duplicates(['comment_id'])
For the comment content, we strip out everything except Chinese characters and common punctuation
import re

def filter_str(desstr, restr=''):
    # keep Chinese characters and common punctuation, drop everything else
    res = re.compile(u"[^\u4e00-\u9fa5^,^.^，^。^？^、^！^；^：^【^】^（^）^“^”^-]")
    return res.sub(restr, desstr)
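Later, when counting cast mentions, the post uses a df_comment variable that is never defined; presumably it is the cleaned comment column. A minimal sketch of that wiring, under that assumption:

df_comment = df_new['content'].astype(str).apply(filter_str)  # assumed: comments cleaned with filter_str above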
2 Ranking of likes and replies
Let’s take a look at the most liked comments first
from pyecharts.charts import Bar, Grid
from pyecharts import options as opts
from pyecharts.globals import ThemeType

approve_sort = df_new.sort_values(by=['approve'], ascending=False)
x_data = approve_sort['nickName'].values.tolist()[:10]
y_data = approve_sort['approve'].values.tolist()[:10]
b = (Bar()
.add_xaxis(x_data)
.add_yaxis(' ',y_data)
.set_global_opts(title_opts = opts.TitleOpts(title='Top 10 Comments liked'))
.set_series_opts(label_opts=opts.LabelOpts(is_show=True,position='right'))
.reversal_axis()
)
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
You can see that at the top of the list is a comment by a viewer nicknamed "Qibao", with 86,027 likes
Take a look at comment responses
reply_sort = df_new.sort_values(by=['reply'], ascending=False)
x_data = reply_sort['nickName'].values.tolist()[:10]
y_data = reply_sort['reply'].values.tolist()[:10]
b = (Bar()
.add_xaxis(x_data)
.add_yaxis(' ',y_data)
.set_global_opts(title_opts = opts.TitleOpts(title='Top 10 Comments by Reply'))
.set_series_opts(label_opts=opts.LabelOpts(is_show=True,position='right'))
.reversal_axis()
)
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
“Qibao” also received the most replies. I wonder what he wrote
df_new[df_new['nickName'].str.contains('Qibao')]['content'].values.tolist()[0]
Output:
'The deepest impression: the whole family went to the movie together, and it was my grandfather's first time. He was a volunteer soldier and had never been to a cinema; we worried he would not adjust, but thanks to the theater staff's care he made it all the way through. Sitting next to him, I saw him secretly wipe away tears several times. When I asked how the film was, he kept murmuring "Good, good, that is exactly how it was for us back then, just like that..." Suddenly that long history felt unexpectedly close to me: in just three hours I saw a war from 70 years ago, the war from the textbooks, and also my grandfather at 19. It truly was his youth!'
It's a very thoughtful comment. If someone in your family lived through the Battle at Lake Changjin, watching the movie in the theater is sure to feel different!
Of course, we can also crawl the replies to each comment through the following interface
I.maoyan.com/apollo/apol…
Simply replace the JSON file name with the corresponding comment_id; the implementation details are left to interested readers
Let’s take a look at the overall review data
3 Ranking by city
Let’s see which cities have the most comments
result = df_new['cityName'].value_counts()[:10].sort_values()
x_data = result.index.tolist()
y_data = result.values.tolist()
b = (Bar()
.add_xaxis(x_data)
.add_yaxis(' ',y_data)
.set_global_opts(title_opts = opts.TitleOpts(title='Top 10 Cities to Review'))
.set_series_opts(label_opts=opts.LabelOpts(is_show=True,position='right'))
.reversal_axis()
)
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
The first-tier cities all made the list; it seems patriotic education has landed much better in these cities
Take a look at the national map of cities
result = df_new['cityName'].value_counts().sort_values()
x_data = result.index.tolist()
y_data = result.values.tolist()
city_list = [list(z) for z in zip(x_data, y_data)]
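The rendered map itself appears in the post only as an image; below is a minimal sketch of how one might draw it with pyecharts' Geo. The chart type and visual settings are my own choices, not the author's original code, and city names pyecharts does not recognize may need to be filtered out first.

from pyecharts.charts import Geo
from pyecharts.globals import ChartType

g = (Geo()
    .add_schema(maptype="china")
    .add("Comments", city_list, type_=ChartType.EFFECT_SCATTER)
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        visualmap_opts=opts.VisualMapOpts(max_=int(max(y_data))),
        title_opts=opts.TitleOpts(title='Comments by city'))
    )
g.render_notebook()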
4 Gender Distribution
Which gender does the movie appeal to more?
attr = ["Other"."Male"."Female"]
b = (Pie()
.add(""[list(z) for z in zip(attr, df_new.groupby("gender").gender.count().values.tolist())])
.set_global_opts(title_opts = opts.TitleOpts(title='Sex distribution'))
.set_series_opts(label_opts=opts.LabelOpts(is_show=True,position='right'))
)
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
Somewhat surprisingly, women make up the larger share of the known-gender data
5 Viewed or not
Maoyan lets users rate a movie without having watched it; let's look at that data
result = df_new["sureViewed"].value_counts()[:10].sort_values().tolist()
b = (Pie()
.add(""[list(z) for z in zip(["Never seen."."Read"], result)])
.set_global_opts(title_opts = opts.TitleOpts(title='Have you seen it?'))
.set_series_opts(label_opts=opts.LabelOpts(is_show=True,position='right'))
)
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
We can see that most people comment after actually watching, which to some extent ensures the reliability of the reviews and ratings
6 Distribution of scores
The Maoyan page shows scores on a 10-point scale, but the API returns them on a 5-point scale
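If you prefer the 10-point scale shown on the page, doubling the API scores is enough (a convenience step, not part of the original analysis):

df_new['score_10'] = df_new['score'] * 2  # assumption: the API's 5-point scale maps linearly to the page's 10-point scale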
result = df_new["score"].value_counts().sort_values()
x_data = result.index.tolist()
y_data = result.values.tolist()
b = (Bar()
.add_xaxis(x_data)
.add_yaxis(' ',y_data)
.set_global_opts(title_opts = opts.TitleOpts(title='Score distribution'))
.set_series_opts(label_opts=opts.LabelOpts(is_show=True,position='right'))
.reversal_axis()
)
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
Scores between 4.5 and 5 clearly dominate; the word of mouth really is good
7 Time distribution of comments
For comment times, I’m using native Echarts directly here
from collections import Counter

result = df_new["comment_time"].values.tolist()
result = [i.split()[1].split(":")[0] + "点" for i in result]  # hour label, e.g. "19点" (7 pm)
result_dict = dict(Counter(result))
result_list = []
for k, v in result_dict.items():
    tmp = {}
    tmp['name'] = k
    tmp['value'] = v
    result_list.append(tmp)
children_dict = {"children": result_list}
The sample address: echarts.apache.org/examples/zh…
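To feed the official example, we can dump children_dict as JSON and paste it into the example's data field (a small convenience step I added):

import json
print(json.dumps(children_dict, ensure_ascii=False))  # paste this output into the Echarts example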
As you can see, 19:00 and 20:00 in the evening are the peak hours for writing comments
8 Daily comment distribution
Here’s the distribution of comments per day
result = df_new["comment_time"].values.tolist()
result = [i.split()[0] for i in result]
result_dict = dict(Counter(result))
b = (Pie()
.add(""[list(z) for z in zip(result_dict.keys(), result_dict.values())])
.set_global_opts(title_opts = opts.TitleOpts(title='Number of comments per day'))
.set_series_opts(label_opts=opts.LabelOpts(is_show=True,position='right'))
)
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
So far, the comments are heavily concentrated on October 8th. Could it be because that was the first day back at work?
9 User level distribution
Let's take a look at the level of Maoyan comment users, though I don't know what the level is actually good for (😀)
result = df_new['userLevel'].value_counts()[:10].sort_values()
x_data = result.index.tolist()
y_data = result.values.tolist()
b = (Bar()
.add_xaxis(x_data)
.add_yaxis(' ',y_data)
.set_global_opts(title_opts = opts.TitleOpts(title='User level'))
.set_series_opts(label_opts=opts.LabelOpts(is_show=True,position='right'))
.reversal_axis()
)
grid = Grid(init_opts=opts.InitOpts(theme=ThemeType.VINTAGE))
grid.add(b, grid_opts=opts.GridOpts(pos_left="20%"))
grid.render_notebook()
Basically everyone is level 2, hahaha, ordinary folks
10 Cast and crew mentions
Let's count how many times the cast and crew are mentioned in the comments
name = ["吴京".Jackson Yi."Yihong Duan"."Zhu Yawen"."李晨"."Hu jun"."Wang ning"."Nomessage"."Items".Czy ""."李军"."Sun yi"."Yi"."Yi Yang"."Thousand seal"
]
def actor(data, name) :
counts = {}
comment = jieba.cut(str(data), cut_all=False)
# Stop using words
for word in comment:
if word in name:
if word == "Yi" or word == "Thousand seal" :
word = Jackson Yi
counts[word] = counts.get(word,0) +1
return counts
counts = actor(', '.join(df_comment.values.tolist()), name)
Unsurprisingly, Yi Yangqianxi (易烊千玺) holds the top spot. Perhaps he has more "mom fans", but his acting also holds up
11 Comment word cloud
Finally, take a look at the word cloud of comments
import numpy as np
from PIL import Image
from wordcloud import WordCloud

font = r'C:\Windows\Fonts\FZSTK.TTF'
# common Chinese stopwords; the original set was garbled in translation, so this is a close reconstruction
STOPWORDS = {"回复", "@", "我", "她", "你", "他", "了", "的", "吧", "吗", "在", "啊", "不", "也", "还", "是",
             "说", "都", "它", "没", "做", "人", "被", "不是", "现在", "什么", "这", "呢", "知道", "我们",
             "他们", "和", "有", "要", "就是", "但是", "而且", "为", "他的", "问题", "一个", "没有", "去",
             "这个", "还有", "到"}

def wordcloud(data, name, pic=None):
    comment = jieba.cut(str(data), cut_all=False)
    words = ' '.join(comment)
    img = Image.open(pic)       # mask image that shapes the cloud
    img_array = np.array(img)
    wc = WordCloud(width=2000, height=1800, background_color='white', font_path=font, mask=img_array,
                   stopwords=STOPWORDS, contour_width=3, contour_color='steelblue')
    wc.generate(words)
    wc.to_file(name + '.png')
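A call like the following would generate the cloud; the output name and the mask image path are placeholders:

wordcloud(df_comment.values.tolist(), 'changjinhu', 'mask.png')  # 'mask.png' is a hypothetical mask image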
Box office forecast for tomorrow
Here we use linear regression (on polynomial features) for a simple box office prediction. After all, the box office is a hugely complex thing, and there is no way to forecast it with complete accuracy
Let's take a look at the daily box office figures for The Battle at Lake Changjin through AKShare
import akshare as ak

movie_boxoffice_daily_df = ak.movie_boxoffice_daily(date="20211008")
print(movie_boxoffice_daily_df)
movie_boxoffice_daily_df[movie_boxoffice_daily_df['影片名称'].str.contains('长津湖')]['单日票房'].values.tolist()[0]
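Two helpers used below, create_assist_date and a box office flavor of get_data, are not defined in the post. Here is a minimal sketch, assuming the AKShare columns used above (note that this get_data shadows the earlier JSON-parsing one; rename one of them in real code):

def create_assist_date(start, end):
    # yyyymmdd strings for every day between start and end, inclusive
    return [d.strftime("%Y%m%d") for d in pd.date_range(start=start, end=end)]

def get_data(movie, date_list):
    # pull the movie's single-day box office for each date from AKShare
    values = []
    for date in date_list:
        daily = ak.movie_boxoffice_daily(date=date)
        row = daily[daily['影片名称'].str.contains(movie)]
        values.append(float(row['单日票房'].values.tolist()[0]))
    return values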
So let’s draw a scatter plot to see what the trend is
from pyecharts.charts import Scatter

def scatter_base(choose, values, date) -> Scatter:
    c = (
        Scatter()
        .add_xaxis(choose)
        .add_yaxis("%s / daily box office" % date, values)
        .set_global_opts(
            title_opts=opts.TitleOpts(title=""),
            # datazoom_opts=opts.DataZoomOpts(),
            yaxis_opts=opts.AxisOpts(
                axislabel_opts=opts.LabelOpts(formatter="{value} / 万")
            )
        )
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    )
    return c
date_list = create_assist_date("20211001", "20211008")
value_list = get_data("长津湖", date_list)
scatter_base(date_list, value_list, '长津湖').render_notebook()
It can be seen that the single-day box office gradually increased from the 1st, reached the peak on the 7th, and began to decline on the 8th
Now let's fit the data, using linear_model from scikit-learn
import numpy as np
import matplotlib.pyplot as mp
import sklearn.pipeline as pl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm

date_list = create_assist_date("20211001", "20211008")
value_list = get_data("长津湖", date_list)
X = np.array([1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008])
X = X.reshape(-1, 1)
y = value_list
model = pl.make_pipeline(
    sp.PolynomialFeatures(5),   # polynomial feature expander
    lm.LinearRegression()       # linear regressor
)
# train the model
model.fit(X, y)
# evaluate predictions on the training points
pred_y = model.predict(X)
print(pred_y)
# draw the polynomial regression curve
px = np.linspace(X.min(), X.max(), 1000)
px = px.reshape(-1, 1)
pred_py = model.predict(px)
# plot the figure
mp.figure("Daily box office data", facecolor='lightgray')
mp.title('Daily Box Office Data Regression', fontsize=16)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.xlabel('x')
mp.ylabel('y')
mp.scatter(X, y, s=60, marker='o', c='dodgerblue', label='Points')
mp.plot(px, pred_py, c='orangered', label='PolyFit Line')
mp.tight_layout()
mp.legend()
mp.show()
Then, based on the fitted model, we can predict tomorrow's box office
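The prediction call itself is not shown in the post; with the date encoding used above (October days as the numbers 1001 to 1008), tomorrow is simply 1009:

# predict October 9th, encoded as 1009 to match the training inputs
print(model.predict(np.array([[1009]])))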
All right, let's wait for tomorrow's numbers to see how the prediction holds up!