Mobile apps are everywhere. They are easy to create and can be very lucrative from the business standpoint. Specifically, Android is expanding as an operating system and has captured more than 74% of the total market[1].
The Google Play Store apps data has enormous potential to facilitate data-driven decisions and insights for businesses. In this notebook, we will analyze the Android app market by comparing ~10k apps in Google Play across different categories. We will also use the user reviews to draw a qualitative comparison between the apps.
The dataset you will use here was scraped from Google Play Store in September 2018 and was published on Kaggle. Here are the details:
From here on, it will be your task to explore and manipulate the data until you are able to answer the three questions described in the instructions panel.
The three questions are:
Read the apps.csv file and clean the Installs column to convert it into integer data type. Save your answer as a DataFrame apps.
Find the number of apps in each category, the average price, and the average rating. Save your answer as a DataFrame app_category_info. You should name the four columns as follows: Category, Number of apps, Average price, Average rating.
Find the top 10 free FINANCE apps having the highest average sentiment score. Save your answer as a DataFrame top_10_user_feedback. Your answer should have exactly 10 rows and two columns named: App and Sentiment Score, where the average Sentiment Score is sorted from highest to lowest.
# Import pandas, then load the apps dataset (datasets/apps.csv) and preview its first rows.
import pandas as pd
apps = pd.read_csv('datasets/apps.csv')
apps.head()
# Load the user reviews dataset (datasets/user_reviews.csv) and preview its first rows.
user_reviews = pd.read_csv('datasets/user_reviews.csv')
user_reviews.head()
# Remove the non-numeric characters (',' and '+') from the 'Installs' column
# and convert it to an integer data type.
# regex=False makes both patterns literal text: '+' is a regex quantifier and
# the default of the `regex` argument is not the same in every pandas version.
apps['Installs'] = (
    apps['Installs']
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
)
apps['Installs'] = apps['Installs'].astype(int)
apps.head()
# Confirm that the 'Installs' column now has an integer data type.
apps['Installs'].dtype
# Build one summary row per category: number of apps, mean price, mean rating.
app_category_info = (
    apps
    .groupby('Category')
    .agg({'Category': 'count', 'Price': 'mean', 'Rating': 'mean'})
    # Rename first, so the count column no longer shares the name 'Category'
    # with the index when the index is turned back into a column below.
    .rename(columns={
        'Category': 'Number of apps',
        'Price': 'Average price',
        'Rating': 'Average rating',
    })
    .reset_index()
)
# Preview the first rows of the summary.
app_category_info.head()
# Select the free apps of the FINANCE category and preview them.
free_finance_apps = apps.query('Category =="FINANCE" and Type=="Free"')
free_finance_apps.head()
# Attach the user reviews to the free finance apps. An inner join keeps only
# the apps that have at least one review row; with a left join, apps without
# reviews would stay in the table and only produce a NaN average below.
free_finance_app_w_reviews = free_finance_apps.merge(user_reviews, on='App', how='inner')
free_finance_app_w_reviews.head()
# Average the 'Sentiment Score' per app, sort the averages from highest to
# lowest and keep the first 10 rows. 'App' remains the index of the result and
# 'Sentiment Score' is its only column.
top_10_user_feedback = (
    free_finance_app_w_reviews
    .groupby('App')[['Sentiment Score']]
    .mean()
    .sort_values('Sentiment Score', ascending=False)
    .head(10)
)
top_10_user_feedback
View in the workspace here: https://app.datacamp.com/workspace/w/ff79ae99-157f-4b83-b8b0-80f6d03c16bd/edit
%%nose
# %%nose needs to be included at the beginning of every @tests cell
# https://instructor-support.datacamp.com/en/articles/4544008-writing-project-tests-guided-and-unguided-r-and-python
# The @solution should pass the tests
# The purpose of the tests is to try to catch common errors and
# to give the student a hint on how to resolve these errors
import numpy as np
correct_apps = pd.read_csv('datasets/apps.csv')
correct_reviews = pd.read_csv('datasets/user_reviews.csv')
# List of characters to remove
chars_to_remove = ['+', ',']
# Replace each character with an empty string
for char in chars_to_remove:
correct_apps['Installs'] = correct_apps['Installs'].apply(lambda x: x.replace(char, ''))
# Convert col to int
correct_apps['Installs'] = correct_apps['Installs'].astype(int)
def test_pandas_loaded():
    """Check that the global namespace holds a name 'pandas' or 'pd'."""
    namespace = globals()
    assert any(name in namespace for name in ('pandas', 'pd')), "pandas is not imported."
def test_installs_plus():
    """Fail when any value of apps['Installs'] still contains a '+'.

    ``'+' in series`` looks the string up among the index labels of the
    Series, not among its values, so the values are searched explicitly. They
    are cast to ``str`` first so the check also works after the column has
    been converted to integers.
    """
    installs_as_text = apps['Installs'].astype(str)
    assert not installs_as_text.str.contains('+', regex=False).any(), \
        'The special character "+" has not been removed from Installs column.'
def test_installs_comma():
    """Fail when any value of apps['Installs'] still contains a ','.

    ``',' in series`` looks the string up among the index labels of the
    Series, not among its values, so the values are searched explicitly. They
    are cast to ``str`` first so the check also works after the column has
    been converted to integers.
    """
    installs_as_text = apps['Installs'].astype(str)
    assert not installs_as_text.str.contains(',', regex=False).any(), \
        'The special character "," has not been removed from the Installs column.'
def test_installs_numeric():
    """Fail unless the apps['Installs'] column has an integer dtype.

    The dtype of the whole column is inspected instead of the element at
    label 0, so the check does not depend on the index containing 0 and it
    accepts every integer width rather than only ``np.int64``.
    """
    assert pd.api.types.is_integer_dtype(apps['Installs']), \
        'The Installs column is not of numeric data type (int).'
def test_q1_app_category_info_columns():
    """Fail when app_category_info lacks one of the expected columns.

    The three aggregate columns are always required. When 'BEAUTY'
    (presumably one of the category labels) is not found in the index, the
    categories are not stored in the index, so 'Category' must be present as
    a regular column as well.
    """
    required = ['Number of apps', 'Average price', 'Average rating']
    # Categories not in the index: 'Category' has to be an ordinary column
    if 'BEAUTY' not in app_category_info.index:
        required = ['Category'] + required
    assert all(x in app_category_info.columns for x in required), \
        "Some columns are missing or incorrectly named in your app_category_info DataFrame. Make sure there are 4 columns named: 'Category', 'Number of apps', 'Average price', 'Average rating'."
def test_q1_app_category_info_app_count():
    """Compare the "Number of apps" column against the reference counts.

    The reference counts the 'App' values of correct_apps per 'Category'.
    Both sides are ordered by category and then given a fresh positional
    index, because ``Series.equals`` also compares index labels and
    ``sort_values`` keeps the labels of the unsorted rows.
    """
    # Move 'Category' out of the index if that is where it is stored
    submitted = app_category_info.reset_index()
    assert 'Number of apps' in submitted.columns, \
        "\"Number of apps\" column is missing in your app_category_info DataFrame."
    # groupby returns its groups sorted by 'Category'
    correct_app_count = (
        correct_apps.groupby('Category')['App'].count()
        .rename('Number of apps')
        .reset_index(drop=True)
    )
    app_count = (
        submitted.sort_values(by='Category')['Number of apps']
        .reset_index(drop=True)
    )
    assert correct_app_count.equals(app_count),\
        "The aggregate function used to calculate \"Number of apps\" is incorrect."
def test_q1_app_category_info_avg_price():
    """Compare the "Average price" column against the reference means.

    The reference is the mean 'Price' of correct_apps per 'Category'. Both
    sides are ordered by category and then given a fresh positional index,
    because ``Series.equals`` also compares index labels and ``sort_values``
    keeps the labels of the unsorted rows.
    """
    # Move 'Category' out of the index if that is where it is stored
    submitted = app_category_info.reset_index()
    assert 'Average price' in submitted.columns, \
        "\"Average price\" column is missing in your app_category_info DataFrame."
    # groupby returns its groups sorted by 'Category'
    correct_avg_price = (
        correct_apps.groupby('Category')['Price'].mean()
        .rename('Average price')
        .reset_index(drop=True)
    )
    avg_price = (
        submitted.sort_values(by='Category')['Average price']
        .reset_index(drop=True)
    )
    assert correct_avg_price.equals(avg_price),\
        "The aggregate function used to calculate \"Average price\" is incorrect."
def test_q1_app_category_info_avg_rating():
    """Compare the "Average rating" column against the reference means.

    The reference is the mean 'Rating' of correct_apps per 'Category'. Both
    sides are ordered by category and then given a fresh positional index,
    because ``Series.equals`` also compares index labels and ``sort_values``
    keeps the labels of the unsorted rows.
    """
    # Move 'Category' out of the index if that is where it is stored
    submitted = app_category_info.reset_index()
    assert 'Average rating' in submitted.columns, \
        "\"Average rating\" column is missing in your app_category_info DataFrame."
    # groupby returns its groups sorted by 'Category'
    correct_avg_rating = (
        correct_apps.groupby('Category')['Rating'].mean()
        .rename('Average rating')
        .reset_index(drop=True)
    )
    avg_rating = (
        submitted.sort_values(by='Category')['Average rating']
        .reset_index(drop=True)
    )
    assert correct_avg_rating.equals(avg_rating),\
        "The aggregate function used to calculate \"Average rating\" is incorrect."
# def test_reviews_loaded():
# assert (correct_reviews.equals(reviews)), "The dataset was not read correctly into reviews."
def test_q2_finance_apps():
    """Check that every app in top_10_user_feedback is a free FINANCE app of correct_apps."""
    is_free = correct_apps['Type'] == 'Free'
    is_finance = correct_apps['Category'] == 'FINANCE'
    allowed_apps = set(correct_apps[is_free & is_finance]['App'])
    # The app names are read from the index when it is named 'App',
    # otherwise from the 'App' column
    if top_10_user_feedback.index.name == 'App':
        selected_apps = top_10_user_feedback.index
    else:
        selected_apps = top_10_user_feedback['App']
    assert set(selected_apps) <= allowed_apps,\
        "You have not selected the free finance apps correctly. Check your answer again."
def test_q2_top_10():
    """Fail unless top_10_user_feedback has exactly 10 rows.

    The message reports the actual row count, because the check also fails
    when fewer than 10 apps were selected.
    """
    n_rows = len(top_10_user_feedback)
    assert n_rows == 10, (
        "top_10_user_feedback should contain exactly 10 apps, but it contains {}. "
        "Please select only top 10 apps with highest average sentiment score."
    ).format(n_rows)
def test_q2_sorted():
    """Check that the apps of top_10_user_feedback appear in the reference order.

    The reference order is built from correct_apps and correct_reviews: free
    FINANCE apps inner-joined with their reviews, mean 'Sentiment Score' per
    app, sorted in descending order, first 10 rows.
    """
    is_free_finance = (correct_apps['Type'] == 'Free') & (correct_apps['Category'] == 'FINANCE')
    reviewed = pd.merge(correct_apps[is_free_finance], correct_reviews, on="App", how="inner")
    mean_scores = reviewed.groupby('App').agg({'Sentiment Score': 'mean'}).reset_index()
    expected_order = list(
        mean_scores.sort_values(by='Sentiment Score', ascending=False)[:10]['App']
    )
    # The app names are read from the index when it is named 'App',
    # otherwise from the 'App' column
    if top_10_user_feedback.index.name == 'App':
        submitted_order = top_10_user_feedback.index
    else:
        submitted_order = top_10_user_feedback['App']
    assert list(submitted_order) == expected_order,\
        "You have not sorted top_10_user_feedback correctly. Make sure to sort your DataFrame on Sentiment Score from highest to lowest (ie - in decreasing order)."
def test_q2():
    """Check the apps and the scores of top_10_user_feedback against the reference.

    The reference is built from correct_apps and correct_reviews: free FINANCE
    apps inner-joined with their reviews, mean 'Sentiment Score' per app,
    sorted in descending order, first 10 rows. When the index of the answer is
    named 'App' the whole DataFrame is compared with ``equals``; otherwise the
    'App' and 'Sentiment Score' columns are compared as lists.
    """
    is_free_finance = (correct_apps['Type'] == 'Free') & (correct_apps['Category'] == 'FINANCE')
    reviewed = pd.merge(correct_apps[is_free_finance], correct_reviews, on="App", how="inner")
    # Reference top 10 with 'App' as the index
    reference_by_app = (
        reviewed.groupby('App')
        .agg({'Sentiment Score': 'mean'})
        .sort_values(by='Sentiment Score', ascending=False)[:10]
    )
    failure_message = "You have not computed top_10_user_feedback correctly. Some values are wrong."
    if top_10_user_feedback.index.name == 'App':
        assert reference_by_app.equals(top_10_user_feedback), failure_message
    else:
        # Same reference rows, with 'App' turned into a regular column
        reference_flat = reference_by_app.reset_index()
        submitted_apps = list(top_10_user_feedback['App'])
        submitted_scores = list(top_10_user_feedback['Sentiment Score'])
        apps_match = submitted_apps == list(reference_flat['App'])
        scores_match = submitted_scores == list(reference_flat['Sentiment Score'])
        assert (apps_match and scores_match), failure_message