library(AER)
data("Journals")
# ?Journals
Lecture 4 – Exercise Problems
1 Exercise 1: Demand for Economics Journal (Stock and Watson (2017), p336-337)
Using Journals from the AER package, examine demand for economics journals (subscriptions at U.S. libraries, year 2000). Read ?Journals first.
Load the data with the following code. Check the description of the data with `?Journals`.
As with any data analysis, exploring the data is the first step. The “Data exploration” section contains exercises to practice data manipulation and visualization for descriptive analysis. From here on, it’s your turn to load the packages as needed. (I would be happy if you chose to use the packages we learned before!)
Data exploration
List the top 10 journals by citations. Also list the top 10 by price.
Confirm the data year and show a short sentence/report (from the help page).
Compute summary stats (mean, median, SD) for: `price`, `pages`, `citations`, and `subs`.
Make scatter plots of `price` vs. `citations` and `price` vs. `subs` using a facet function (two panels).
Regression Analysis
The goal here is to examine the demand for economics journals. Specifically, we want to estimate the price elasticity of demand for economics journals.
- First, create the following new variables:
  - `citeprice`: the price per citation (handle divide-by-zero safely).
  - `age`: the number of years since the journal was first published (2000 - `foundingyear`).
  - `character_million`: the number of characters (per issue) in millions.
- Estimate the following four regression models ((1)~(4)), and report the results in a table.
\[\begin{align} (1) \, \log(subs) &= \beta_0 + \beta_1 \log(citeprice) + e \\ (2) \, \log(subs) &= \beta_0 + \beta_1 \log(citeprice) + \beta_2 \log(age) + e \\ (3) \, \log(subs) &= \beta_0 + \beta_1 \log(citeprice) + \beta_2 \log(age) + \beta_3 \log(character\_million) + e \\ (4) \, \log(subs) &= \beta_1 \log(citeprice) + \beta_2 \log(age) + \beta_3 \log(character\_million) + e \quad \text{(no intercept)} \end{align}\]
# === Load packages === #
library(AER)
library(data.table)
library(ggplot2)
library(modelsummary)
# Data
data("Journals")
setDT(Journals)
# === Part 1 === #
# Top 10 journals by citations (price shown for reference).
# Sort descending on citations first, then take the first 10 rows.
Journals[order(-citations)][1:10, .(title, citations, price)]
title citations price
<char> <int> <int>
1: American Economic Review 8999 47
2: Econometrica 7943 178
3: Journal of Political Economy 6697 159
4: Quarterly Journal of Economics 4138 148
5: Journal of Finance 3791 226
6: Journal of the American Statistical Association 2800 310
7: Journal of Consumer Research 2762 90
8: Journal of Financial Economics 2676 1339
9: Economic Journal 2540 301
10: Journal of Economic Theory 2514 1400
order(-price)][1:10, .(title, price, citations)] Journals[
title price citations
<char> <int> <int>
1: Applied Economics 2120 578
2: Journal of Econometrics 1893 2479
3: Journal of Banking and Finance 1539 602
4: Economics Letters 1492 930
5: World Development 1450 1408
6: Journal of Public Economics 1431 1437
7: Journal of Economic Theory 1400 2514
8: Journal of Financial Economics 1339 2676
9: Research Policy 1234 922
10: Ecological Economics 1170 499
# === Part 2 === #
# Preview the first rows of the key variables. Per the exercise text and the
# help page (?Journals), these are subscriptions at U.S. libraries, year 2000.
head(Journals[, .(title, subs, price, pages, citations)])
title subs price pages
<char> <int> <int> <int>
1: Asian-Pacific Economic Literature 14 123 440
2: South African Journal of Economic History 59 20 309
3: Computational Economics 17 443 567
4: MOCT-MOST Economic Policy in Transitional Economics 2 276 520
5: Journal of Socio-Economics 96 295 791
6: Labour Economics 15 344 609
citations
<int>
1: 21
2: 22
3: 22
4: 22
5: 24
6: 24
# === Part 3 === #
# Summary statistics (mean, median, SD) for price, pages, citations and subs,
# computed in a single data.table `j` expression; na.rm = TRUE guards against
# missing values even though the dataset is expected to be complete.
sumstats <- Journals[, .(
  mean_price   = mean(price, na.rm = TRUE),
  median_price = median(price, na.rm = TRUE),
  sd_price     = sd(price, na.rm = TRUE),
  mean_pages   = mean(pages, na.rm = TRUE),
  median_pages = median(pages, na.rm = TRUE),
  sd_pages     = sd(pages, na.rm = TRUE),
  mean_cites   = mean(citations, na.rm = TRUE),
  median_cites = median(citations, na.rm = TRUE),
  sd_cites     = sd(citations, na.rm = TRUE),
  mean_subs    = mean(subs, na.rm = TRUE),
  median_subs  = median(subs, na.rm = TRUE),
  sd_subs      = sd(subs, na.rm = TRUE)
)]
sumstats
mean_price median_price sd_price mean_pages median_pages sd_pages mean_cites
<num> <num> <num> <num> <num> <num> <num>
1: 417.7222 282 385.8346 827.7444 693 436.8174 647.0556
median_cites sd_cites mean_subs median_subs sd_subs
<num> <num> <num> <num> <num>
1: 262.5 1182.374 196.8667 122.5 204.5288
# === Part 4 === #
# Reshape to long format so citations and subs share the price axis,
# then draw the two panels at once with facet_wrap().
plot_dt <- melt(
  Journals[, .(price, citations, subs)],
  id.vars = "price",
  variable.name = "yvar",
  value.name = "y"
)
ggplot(plot_dt, aes(x = price, y = y)) +
  geom_point() +
  facet_wrap(~ yvar, scales = "free_y") +
  labs(x = "Price", y = "", title = "Price vs. Citations/Subs") +
  theme_bw()
# ---------- Regression analysis ----------
# === Part 1 === #
# Create variables.
# Founding year can be in 'foundingyear' or 'start' depending on AER version:
if ("foundingyear" %in% names(Journals)) {
  Journals[, age := 2000 - foundingyear]
} else if ("start" %in% names(Journals)) {
  Journals[, age := 2000 - start]
} else {
  stop("Founding year variable not found. Look for 'foundingyear' or 'start'.")
}

# Character count per issue, in millions.
if ("char" %in% names(Journals)) {
  Journals[, character_million := char / 1e6]
} else if (all(c("charpp", "pages") %in% names(Journals))) {
  # AER::Journals stores characters per page ('charpp');
  # characters per issue = charpp * pages.
  Journals[, character_million := charpp * pages / 1e6]
} else if ("title" %in% names(Journals)) {
  # last-resort fallback: title length (crude proxy only)
  Journals[, character_million := nchar(as.character(title)) / 1e6]
} else {
  stop("Neither 'char' nor 'charpp' nor 'title' found to construct character_million.")
}

# Price per citation; pmax() avoids division by zero for uncited journals.
Journals[, citeprice := price / pmax(citations, 1)]

# To safely take logs, clamp at a small positive offset where needed:
eps <- 1e-6
Journals[, l_subs      := log(pmax(subs, eps))]
Journals[, l_citeprice := log(pmax(citeprice, eps))]
Journals[, l_age       := log(pmax(age, 1))] # age should be >= 1
Journals[, l_char_mil  := log(pmax(character_million, eps))]
# === Part 2 === #
# Estimate models (1)-(4). Model (4) drops the intercept ('- 1') per the spec.
reg1 <- lm(l_subs ~ l_citeprice, data = Journals)
reg2 <- lm(l_subs ~ l_citeprice + l_age, data = Journals)
reg3 <- lm(l_subs ~ l_citeprice + l_age + l_char_mil, data = Journals)
reg4 <- lm(l_subs ~ l_citeprice + l_age + l_char_mil - 1, data = Journals)
# Report
# Present the four specifications side by side: coef_map relabels and orders
# the coefficients, gof_map restricts the fit statistics shown, and stars
# follow the 0.05 / 0.01 / 0.001 convention.
modelsummary(
  models = list(
    "Model (1)" = reg1,
    "Model (2)" = reg2,
    "Model (3)" = reg3,
    "Model (4)" = reg4
  ),
  coef_map = c(
    "(Intercept)" = "Intercept",
    "l_citeprice" = "log(Price per citation)",
    "l_age" = "log(Age)",
    "l_char_mil" = "log(Characters per issue, millions)"
  ),
  gof_map = c("nobs", "r.squared", "adj.r.squared"),
  stars = c("*" = .05, "**" = .01, "***" = .001),
  output = "gt",
  notes = list("Std. Errors in parentheses")
)
Model (1) | Model (2) | Model (3) | Model (4) | |
---|---|---|---|---|
Intercept | 4.766*** | 3.396*** | 0.931 | |
(0.056) | (0.300) | (1.469) | ||
log(Price per citation) | -0.533*** | -0.434*** | -0.439*** | -0.438*** |
(0.036) | (0.040) | (0.040) | (0.040) | |
log(Age) | 0.419*** | 0.392*** | 0.394*** | |
(0.090) | (0.091) | (0.091) | ||
log(Characters per issue, millions) | -0.242 | -0.329*** | ||
(0.141) | (0.029) | |||
Num.Obs. | 180 | 180 | 180 | 180 |
R2 | 0.557 | 0.605 | 0.612 | 0.979 |
R2 Adj. | 0.555 | 0.601 | 0.605 | 0.979 |
* p < 0.05, ** p < 0.01, *** p < 0.001 | ||||
Std. Errors in parentheses |
2 Exercise 2: Analysis of the Test Score Dataset (Stock and Watson (2017), Chapter 7.6)
Using the CASchools
dataset from the AER
package, let’s do some regression analysis to examine the effects on test scores of the student-teacher ratio. The dataset contains data on test performance, school characteristics and student demographic backgrounds for school districts in California.
First, let’s load the data. See ?CASchools
for the description of variables in the dataset.
# === Loading Data === #
data("CASchools")
- Create the following new variables:
  - `stratio`: student-teacher ratio (`students/teachers`)
  - `score`: the average of math and reading scores (`(math + read)/2`)
Using the `datasummary` function, generate a summary table of key descriptive statistics for the CASchools dataset, including variables like `score`, `stratio`, `english`, `lunch`, and `calworks`.
Using `ggplot` functions, create scatter plots of test scores (`score`) against the percentage of English language learners (`english`), the percentage qualifying for reduced-price lunch (`lunch`), and the percentage qualifying for income assistance (`calworks`). (It would be nice if you could use the `facet_wrap()` function to show them all at once.)
Estimate the following five regression models and report the results in a table.
# === Load package === #
# This is redundant, but I am loading the same packages we already used before just to show which packages I am using for this problem. Note that you only need to load a package once per R session.
library(AER)
library(data.table)
library(ggplot2)
library(modelsummary)
# === Data === #
data(CASchools)
setDT(CASchools)
# === Part 1 === #
# Add the student-teacher ratio and the average test score in place with `:=`.
CASchools[, `:=`(
  stratio = students/teachers,
  score = (math + read)/2
)]
# === Part 2 === #
# Descriptive statistics for the key variables; the two-sided formula maps
# variables (left) to statistics (right). The trailing comma after `data = `
# in the original created an empty argument and has been removed.
datasummary(
  formula = score + stratio + english + lunch + calworks ~ Mean + SD + Min + Max,
  data = CASchools
)
Mean | SD | Min | Max | |
---|---|---|---|---|
score | 654.16 | 19.05 | 605.55 | 706.75 |
stratio | 19.64 | 1.89 | 14.00 | 25.80 |
english | 15.77 | 18.29 | 0.00 | 85.54 |
lunch | 44.71 | 27.12 | 0.00 | 100.00 |
calworks | 13.25 | 11.45 | 0.00 | 78.99 |
# === Part 3 === #
# Long format: one panel per covariate (english, lunch, calworks),
# all sharing score on the y-axis.
long2 <- melt(
  CASchools[, .(score, english, lunch, calworks)],
  id.vars = "score",
  variable.name = "xvar",
  value.name = "x"
)
ggplot(long2, aes(x = x, y = score)) +
  geom_point() +
  facet_wrap(~ xvar, scales = "free_x") +
  labs(x = "", y = "Score", title = "Score vs. Demographic/Program Shares") +
  theme_bw()
# === Part 4 === #
# Five nested specifications. NOTE: the first two object names are misspelled
# ('rge1'/'rge2' rather than 'reg1'/'reg2') but are kept as-is because the
# modelsummary() call below refers to them by these names.
rge1 <- lm(score ~ stratio, data = CASchools)
rge2 <- lm(score ~ stratio + english, data = CASchools)
reg3 <- lm(score ~ stratio + english + lunch, data = CASchools)
reg4 <- lm(score ~ stratio + english + calworks, data = CASchools)
reg5 <- lm(score ~ stratio + english + lunch + calworks, data = CASchools)
# Report models (1)-(5); fmt = 2 rounds estimates to two decimal places.
# Label typos fixed: "Englisch" -> "English", "subsidiezxed" -> "subsidized".
modelsummary(
  models = list(rge1, rge2, reg3, reg4, reg5),
  stars = c("*" = .05, "**" = .01, "***" = .001),
  coef_map = c(
    "stratio" = "Student-teacher ratio",
    "english" = "Percent English learners",
    "lunch" = "Percent eligible for subsidized lunch",
    "calworks" = "Percent qualifying for CalWorks",
    "(Intercept)" = "Intercept"
  ),
  gof_map = c("nobs", "r.squared", "adj.r.squared"),
  fmt = 2,
  notes = list("Std. Errors in parentheses")
)
(1) | (2) | (3) | (4) | (5) | |
---|---|---|---|---|---|
* p < 0.05, ** p < 0.01, *** p < 0.001 | |||||
Std. Errors in parentheses | |||||
Student-teacher ratio | -2.28*** | -1.10** | -1.00*** | -1.31*** | -1.01*** |
(0.48) | (0.38) | (0.24) | (0.31) | (0.24) | |
Percent Englisch learners | -0.65*** | -0.12*** | -0.49*** | -0.13*** | |
(0.04) | (0.03) | (0.03) | (0.03) | ||
Percent eligible for subsidiezxed lunch | -0.55*** | -0.53*** | |||
(0.02) | (0.03) | ||||
Percent qualifying for CalWorks. | -0.79*** | -0.05 | |||
(0.05) | (0.06) | ||||
Intercept | 698.93*** | 686.03*** | 700.15*** | 698.00*** | 700.39*** |
(9.47) | (7.41) | (4.69) | (6.02) | (4.70) | |
Num.Obs. | 420 | 420 | 420 | 420 | 420 |
R2 | 0.051 | 0.426 | 0.775 | 0.629 | 0.775 |
R2 Adj. | 0.049 | 0.424 | 0.773 | 0.626 | 0.773 |