library(tidyverse)
library(yaml)
library(coda)

── Attaching core tidyverse packages ───────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Identifier of the experiment configuration to inspect.
config_id <- "mlml6_rate_pred_clsp"

# The experiment's main config supplies the id of the dataset to load.
main_config <- yaml.load_file(paste0("../experiments/configs/", config_id, "/main.yaml"))
# `[[` matches the key exactly; `$` on a plain list partially matches names,
# so it could silently pick up a differently named key that merely starts
# with "dataset_id".
dataset_id <- main_config[["dataset_id"]]
# Fail here rather than letting paste0() drop a missing id and build the
# path "../data/pitchfork//summary_df.csv".
if (is.null(dataset_id)) {
    stop(
        "`dataset_id` is missing from main.yaml for config '", config_id, "'",
        call. = FALSE
    )
}

df <- read_csv(paste0("../data/pitchfork/", dataset_id, "/summary_df.csv"))

Rows: 22063 Columns: 8
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (6): artist, album, reviewer, genre, label, reviewed
dbl (2): rating, review_n_tokens

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Peek at the first rows of the loaded reviews
head(df)

# Rendering options for plots: width, height and resolution
options(
    repr.plot.width = 16,
    repr.plot.height = 8,
    repr.plot.res = 100
)

# Helpers to avoid redundant code

#' Apply the notebook's shared plot styling
#'
#' @param plt A ggplot object.
#' @param text_size Size passed to `element_text()` for the plot's `text`
#'   theme element. Defaults to 16, the value that used to be hard-coded, so
#'   existing one-argument calls are unaffected.
#' @return `plt` with `theme_classic()` and the text-size override added.
set_theme_elements <- function(plt, text_size = 16) {
    plt +
        theme_classic() +
        theme(text = element_text(size = text_size))
}

# Number of rows whose (album, artist, rating, reviewer, reviewed) combination
# occurs more than once in the data
df %>%
    filter(n() > 1, .by = c(album, artist, rating, reviewer, reviewed)) %>%
    nrow()

# Quantiles of the per-review token counts, shown as a two-column table
quantile(df$review_n_tokens) %>%
    as_tibble(rownames="quantile")

# 90% interval from coda's HPDinterval(), computed on the raw token counts;
# the bounds are kept as globals for the density plot that follows
hdr_result <- df %>%
    pull(review_n_tokens) %>%
    as.mcmc() %>%
    HPDinterval(prob = 0.9)
hdr_lower <- hdr_result[, "lower"]
hdr_upper <- hdr_result[, "upper"]
print(hdr_lower)
print(hdr_upper)

[1] 550
[1] 922

# Density of review token counts with the interval bounds (`hdr_lower`,
# `hdr_upper`) marked as dashed vertical lines.
#
# The bounds are single constants, so they are passed to geom_vline() as
# arguments instead of through aes(). Mapping them inside aes() recycles each
# value against every row of `df` (one overdrawn line per row) and defers
# evaluation to render time, when the globals may already hold other values.
plt <- df %>%
    ggplot() +
    geom_density(aes(review_n_tokens), alpha=0.7, fill="gray") +
    geom_vline(
        xintercept = c(hdr_lower, hdr_upper),
        color = "firebrick",
        linetype = "dashed"
    )

# Shared styling plus axis labels
plt <- plt %>%
    set_theme_elements() +
    xlab("Num. Tokens") +
    ylab("Density")

plt

# Quantiles of the review ratings, shown as a two-column table
quantile(df$rating) %>%
    as_tibble(rownames="quantile")

# 90% interval from coda's HPDinterval(), computed on the raw ratings;
# the bounds are kept as globals for the density plot that follows
hdr_result <- df %>%
    pull(rating) %>%
    as.mcmc() %>%
    HPDinterval(prob = 0.9)
hdr_lower <- hdr_result[, "lower"]
hdr_upper <- hdr_result[, "upper"]
print(hdr_lower)
print(hdr_upper)

[1] 5.5
[1] 9

# Density of review ratings with the interval bounds (`hdr_lower`,
# `hdr_upper`) marked as dashed vertical lines.
#
# The bounds are single constants, so they are passed to geom_vline() as
# arguments instead of through aes(). Mapping them inside aes() recycles each
# value against every row of `df` (one overdrawn line per row) and defers
# evaluation to render time, when the globals may already hold other values.
plt <- df %>%
    ggplot() +
    geom_density(aes(rating), alpha=0.7, fill="gray") +
    geom_vline(
        xintercept = c(hdr_lower, hdr_upper),
        color = "firebrick",
        linetype = "dashed"
    )

# Shared styling plus axis labels
plt <- plt %>%
    set_theme_elements() +
    xlab("Rating") +
    ylab("Density")

plt

# ggsave("pitchfork_rating_distribution.png", plot = plt, height = 4, width = 10, dpi = 300)

# Count of distinct artist names in the data
n_artists <- length(unique(df[["artist"]]))
n_artists

# Total number of review rows
n_rows <- nrow(df)

# Average number of review rows per artist, to two decimals
round(n_rows / n_artists, digits = 2)

# Albums per artist. (artist, album) pairs are de-duplicated first so that an
# album with multiple reviews is counted once.
album_count_df <- df %>%
    distinct(artist, album) %>%
    group_by(artist) %>%
    summarize(n_albums = n())

# Quantiles of the per-artist album counts
quantile(album_count_df$n_albums) %>%
    as_tibble(rownames="quantile")

# Artists with the highest album counts
album_count_df %>%
    arrange(desc(n_albums)) %>%
    head()

# Exclude the "Various Artists" entry before summarizing again
album_count_df <- album_count_df %>%
    filter(artist != "Various Artists")

# Quantiles of the per-artist album counts without that entry
quantile(album_count_df$n_albums) %>%
    as_tibble(rownames="quantile")

# Bar chart: how many artists have each album count
plt <- album_count_df %>%
    ggplot(aes(x = n_albums)) +
    geom_bar(alpha=0.7, fill="gray")

# Shared styling plus axis labels
plt <- plt %>%
    set_theme_elements() +
    xlab("Num. Albums") +
    ylab("Count")

plt

# Number of (artist, album) pairs that appear in more than one review row
df %>%
    group_by(artist, album) %>%
    summarize(n_reviews = n(), .groups = "drop") %>%
    filter(n_reviews > 1) %>%
    nrow()

# Density of review counts per reviewer, with a "|" tick drawn at y = 0 for
# each reviewer. Reviewers with three or fewer reviews are left out.
plt <- df %>%
    group_by(reviewer) %>%
    summarize(n_reviews = n()) %>%
    filter(n_reviews > 3) %>%
    ggplot(aes(x = n_reviews)) +
    geom_density(alpha=0.7, fill="gray") +
    geom_point(aes(y = 0), shape="|", size=3)

# Shared styling plus axis labels
plt <- plt %>%
    set_theme_elements() +
    xlab("Num. Reviews") +
    ylab("Density")

plt

# Density of each reviewer's mean rating, with a "|" tick drawn at y = 0 for
# each reviewer. Reviewers with three or fewer ratings are left out.
plt <- df %>%
    group_by(reviewer) %>%
    summarize(mean_rating = mean(rating), n_ratings = n()) %>%
    filter(n_ratings > 3) %>%
    ggplot(aes(x = mean_rating)) +
    geom_density(alpha=0.7, fill="gray") +
    geom_point(aes(y = 0), shape="|", size=3)

# Shared styling plus axis labels
plt <- plt %>%
    set_theme_elements() +
    xlab("Rating") +
    ylab("Density")

plt

# Median rating over all review rows for The National
tnl_med_rating <- df %>%
    filter(artist == "The National") %>%
    pull(rating) %>%
    median()
tnl_med_rating

# For comparison: quantiles of the per-artist median rating across all other
# artists that have more than five review rows
df %>%
    filter(artist != "The National") %>%
    group_by(artist) %>%
    filter(n() > 5) %>%
    summarize(median_rating = median(rating)) %>%
    pull(median_rating) %>%
    quantile() %>%
    as_tibble(rownames = "quantile")

artist	album	rating	reviewer	genre	label	reviewed	review_n_tokens
<chr>	<chr>	<dbl>	<chr>	<chr>	<chr>	<chr>	<dbl>
Joni Mitchell	Joni Mitchell at Newport	6.0	Grayson Haver Currin	Rock	Rhino	July 29, 2023	721
The Clientele	I Am Not There Anymore	8.0	Marc Hogan	Rock	Merge	July 28, 2023	718
Nas	Magic 2	6.5	Dylan Green	Rap	Mass Appeal	July 28, 2023	736
Beverly Glenn-Copeland	The Ones Ahead	8.0	Jesse Dorris	Experimental	Transgressive	July 28, 2023	704
Damon Locks	New Future City Radio	6.9	Matthew Blackwell	Jazz / Experimental	International Anthem	July 28, 2023	797
DJ K	PANICO NO SUBMUNDO	7.9	Nadine Smith	Experimental	Nyege Nyege Tapes	July 27, 2023	682

quantile	value
<chr>	<dbl>
0%	17
25%	631
50%	707
75%	804
100%	1231

quantile	value
<chr>	<dbl>
0%	0.0
25%	6.6
50%	7.3
75%	7.8
100%	10.0

quantile	value
<chr>	<dbl>
0%	1
25%	1
50%	1
75%	2
100%	711

artist	n_albums
<chr>	<int>
Various Artists	711
Neil Young	33
Gucci Mane	25
Guided by Voices	25
Bob Dylan	24
David Bowie	23

Exploratory dataset analysis

Configurations

Load data

Exploratory analysis

Spot checks

Duplicate check

Key investigations

Distribution of token counts

Distribution of ratings

Other investigations

Unique artists

Artist album counts

Multiple reviews

Distribution of review counts per reviewer

Distribution of average rating among reviewers

The National - where do they stand?