Initial Dataset Preparation¶
In this notebook, we will prepare our root dataset. To view the next notebook in the sequence, use the navigation link above or at the bottom of this notebook.
First, let's import the libraries that will be required for this notebook.
Note that myutilpy is a custom package created for this repo; it contains helper code used throughout this sequence of notebooks. Here, we import the myutilpy.data_processing module as dprep and use its utility functions. All of the source code is available in this repository.
import multiprocessing
import yaml
from pathlib import Path
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
# Custom package for this project
import myutilpy.data_processing as dprep
Configurations¶
Next, let’s do some setup. We will load the associated configurations for the desired experiment.
For this sequence of notebooks, we will be fine-tuning a MiniLM-L6 model. This model generates embeddings that capture information about passages of text and can be used for various NLP tasks. It is much lighter-weight (i.e., has fewer parameters) than other transformer models well-suited to the same tasks, but still delivers strong performance. A helpful comparison of performance across model architectures can be found here; it was created by the authors of this (and several other) models uploaded to the Hugging Face model repository.
config_id = "mlml6_rate_pred_clsp"
num_cores_avail = max(1, multiprocessing.cpu_count() - 1)
Configuration settings are stored in .yaml files in the experiments/configs/ directory.
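For orientation, main.yaml for this experiment defines the handful of fields read below. The values shown here are illustrative placeholders only; the real file lives at experiments/configs/mlml6_rate_pred_clsp/main.yaml.
# Illustrative placeholders only -- not the actual configuration values
dataset_checkpoint: "<Hugging Face dataset repo id for the Pitchfork reviews>"
dataset_checkpoint_revision: "<dataset revision or commit hash>"
pt_model_checkpoint: "<MiniLM-L6 model repo id on the Hugging Face Hub>"
pt_model_checkpoint_revision: "<model revision or commit hash>"
dataset_id: "<identifier for the prepared dataset directory>"
data_seed: 42  # placeholder seed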
with open(f"../experiments/configs/{config_id}/main.yaml", 'r') as f:
main_config = yaml.safe_load(f)
dataset_checkpoint = main_config["dataset_checkpoint"]
dataset_checkpoint_revision = main_config["dataset_checkpoint_revision"]
pt_model_checkpoint = main_config["pt_model_checkpoint"]
pt_model_checkpoint_revision = main_config["pt_model_checkpoint_revision"]
dataset_id = main_config["dataset_id"]
data_seed = main_config["data_seed"]
root_dataset_dir = f"../data/pitchfork/{dataset_id}"
raw_data_cache_dir = f"../data/pitchfork/raw/cache"
Path(raw_data_cache_dir).mkdir(parents=True, exist_ok=True)
Path(root_dataset_dir).mkdir(parents=True, exist_ok=True)
Tokenizer and dataset loading¶
Now, we will load the tokenizer associated with our model of choice.
tokenizer = AutoTokenizer.from_pretrained(
pt_model_checkpoint,
revision=pt_model_checkpoint_revision
)
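As a quick sanity check (not part of the original pipeline), we can probe the tokenizer on a short string; the unk_token_id attribute shown here is also what we will use later to detect unknown tokens.
# Quick tokenizer probe (illustrative only): encode a short string and
# inspect the resulting ids, tokens, and the special "unknown" token
sample_text = "An example review snippet."
encoded = tokenizer(sample_text)
print(encoded.input_ids)
print(tokenizer.convert_ids_to_tokens(encoded.input_ids))
print(tokenizer.unk_token, tokenizer.unk_token_id)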
Let's also download the dataset that we will be using for this project: music reviews scraped from the Pitchfork website. A full description of the dataset can be found in the dataset description card.
# Make sure to specify "reviews.csv" since it will default to album images
raw_datasets = load_dataset(
dataset_checkpoint,
revision=dataset_checkpoint_revision,
data_files=["reviews.csv"],
cache_dir=raw_data_cache_dir
)
Before moving on, let's have a quick look at the dataset summary. Notice that the data do not come pre-split: all rows (observations) are placed in a single "train" split by default.
raw_datasets
DatasetDict({
train: Dataset({
features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url'],
num_rows: 25709
})
})
Preprocess raw dataset¶
The first major step is to clean and preprocess the raw data. We will do some exploratory analysis after this step is completed.
dataset = raw_datasets["train"]
Missing data¶
The first filtering step is to exclude rows where the "artist", "album", "review", or "reviewer" fields are not strings (e.g., are None). If we later decide to do any analysis involving these columns, we want to be sure that valid data are present in every row of the prepared dataset.
# The artist, album, review, and reviewer columns should be strings (e.g., should not be None)
dataset = dataset.filter(
lambda examples: dprep.detect_wrong_type_batched(examples, ["artist", "album", "review", "reviewer"], str),
batched=True,
num_proc=num_cores_avail
)
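The detect_wrong_type_batched helper lives in myutilpy.data_processing; a minimal batched predicate playing the same role (returning True for rows whose listed columns are all strings, so that Dataset.filter keeps them) might look something like this hypothetical sketch:
# Hypothetical sketch -- the actual implementation is in myutilpy/data_processing.py.
# Returns one boolean per row in the batch: True if every listed column holds a
# value of the expected type, so that Dataset.filter keeps only those rows.
def keep_rows_with_expected_type_batched(examples, columns, expected_type):
    n_rows = len(examples[columns[0]])
    return [
        all(isinstance(examples[col][i], expected_type) for col in columns)
        for i in range(n_rows)
    ]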
We see that this step filtered out a sizable chunk of the data: roughly 2,700 rows.
dataset
Dataset({
features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url'],
num_rows: 23034
})
Duplicates¶
One common issue with datasets scraped from the web is that, along with missing values, they may contain duplicate rows. Fortunately, our dataset is small enough that we can use built-in pandas functionality to drop duplicate rows.
dataset = Dataset.from_pandas(
dataset.to_pandas().drop_duplicates().reset_index(drop=True)
)
We see that nearly 1,000 duplicate rows were dropped in this step.
dataset
Dataset({
features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url'],
num_rows: 22063
})
Unknown tokens¶
Next, we will attempt to minimize the number of "unknown" tokens that find their way into our dataset. The MiniLM-L6 tokenizer maps words/characters that did not appear in its training vocabulary to an [UNK] token. This is not a huge issue in general, but it can degrade performance if it occurs frequently. For this reason, we will replace common characters in our data that map to the [UNK] token; for example, we will replace the '“' character with '"' and the '♡' character with 'heart'. For a full list of replaced characters (or sequences of characters), see myutilpy/data_processing.py in the project source code.
blacklist_pattern = dprep.get_blacklist_pattern(dataset_id)
# Replace known "unk" tokens
dataset = dataset.map(
lambda examples: dprep.replace_known_unk_tokens_batched(examples, ["artist", "album", "review", "reviewer"], blacklist_pattern),
batched=True,
num_proc=num_cores_avail
)
Map (num_proc=15): 0%| | 0/22063 [00:00<?, ? examples/s]
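To give a sense of what this step involves, a stripped-down version of the replacement logic might look like the sketch below. The character map and function name here are hypothetical and abbreviated; the full list of replacements lives in myutilpy/data_processing.py.
import re

# Hypothetical, abbreviated replacement map -- the real list is much longer
known_unk_replacements = {
    "\u201c": '"',      # left curly double quote -> straight quote
    "\u201d": '"',      # right curly double quote -> straight quote
    "\u2661": "heart",  # white heart suit (♡) -> the word "heart"
}
known_unk_pattern = re.compile(
    "|".join(re.escape(k) for k in known_unk_replacements)
)

def replace_known_unk_chars_batched(examples, columns):
    # Apply the substitutions to every listed text column in the batch
    for col in columns:
        examples[col] = [
            known_unk_pattern.sub(lambda m: known_unk_replacements[m.group(0)], text)
            for text in examples[col]
        ]
    return examples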
Let's check how many rows still contain unknown tokens in the "review" column. Note that many of the "review" entries exceed the model's maximum sequence length, which is what triggers the tokenizer warnings below; more on this later.
dataset_leftover = dataset.filter(
lambda examples: dprep.detect_unk_batched(examples, ["review"], tokenizer),
batched=True,
num_proc=num_cores_avail
)
Filter (num_proc=15): 0%| | 0/22063 [00:00<?, ? examples/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (660 > 512). Running this sequence through the model will result in indexing errors
(The same warning is repeated for many other long reviews, e.g., 721 > 512, 852 > 512, 924 > 512.)
Fortunately, only a handful of rows (48) still contain unknown tokens.
dataset_leftover
Dataset({
features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url'],
num_rows: 48
})
Closer look¶
Let's take a closer look at which tokens are still mapped to [UNK].
unk_tokens = set()
for i in range(len(dataset_leftover)):
text = dataset_leftover[i]["review"]
inputs = tokenizer(text, return_offsets_mapping=True)
ids = inputs.input_ids
offsets = inputs.offset_mapping
for j, id in enumerate(ids):
if id == tokenizer.unk_token_id:
unk_tokens.add(text[offsets[j][0]: offsets[j][1]])
Token indices sequence length is longer than the specified maximum sequence length for this model (773 > 512). Running this sequence through the model will result in indexing errors
print(*unk_tokens)
개꿈 佛 うたのきしゃ 先 霊 emphatic¸ ♑ 敗 七 冥 愚 閃 玉 想 □ 音 所 靈 蒸 ‽ 绿 观 ƚI 界 戰 卡 節 轉 dᴉlɟ 偉 乱 去 駭 共 狗 36℃ 夢 者 燕 詩 14℃ 只 ؟ 10℃ YTI⅃AƎЯ ♈ ◕ 印 옛날이야기 會 ዘላለም 兰 疊 鬼 物 💯 傍 剣 ɪᴍᴘᴀᴄᴛ21 指 ¯ ❀ 縞 浴 ƨbnƎ ⌘v 殺 蛰 ☕ 制 怕 奏 茶 過 ☽ 박혜진 念 吸 九 観 惊 曜 希 ゾット 重 害 來 呼 隠 波 象 。 Ⓡ 市 廁 0℃ 17℃ 幽 與 苑 客 ˂stranger˃ 縦 矮 ✓ ⌘
print(dataset_leftover["artist"])
['Lucy Liyou', 'Mark Barrott', 'Tzusing', 'Lucinda Chua', 'otay:onii', 'Two Shell', 'Bill Callahan', 'Sam Gendel', 'Willow', 'death’s dynamic shroud', '4s4ki', 'Tatsuro Yamashita', 'Two Shell', 'Whatever the Weather', 'Pan Daijing', 'JPEGMAFIA', 'Yikii', '박혜진 Park Hye Jin', 'Pan Daijing', 'Jusell, Prymek, Sage, Shiroishi', 'Rian Treanor', '박혜진 Park Hye Jin', 'Okkyung Lee', 'Gong Gong Gong 工工工', 'Fire-Toolz', 'Brian Eno', 'BTS', 'HARAM', 'RRUCCULLA', 'George Clanton', 'Fire-Toolz', 'Meuko! Meuko!', 'BTS', 'Mukqs', 'Guided by Voices', 'Varg2TM', 'Grandaddy', 'Toyomu', 'Mikael Seifu', 'Especia', 'Creepoid', 'Kosmo Kat', 'TV on the Radio', 'Lee', 'Ryan Hemsworth', 'Javelin', 'The Soft Moon', 'Pit Er Pat']
print(dataset_leftover["album"])
['Dog Dreams (개꿈)', 'Jōhatsu (蒸発)', '绿帽 Green Hat', 'YIAN', '夢之駭客 Dream Hacker', 'lil spirits', 'YTI⅃AƎЯ', 'Blueblue', '<CopingMechanism>', 'Darklife', 'Killer in Neverland', 'Softly', 'Icons EP', 'Whatever the Weather', 'Tissues', 'LP!', 'Crimson Poem', 'Before I Die', 'Jade 玉观音', 'Fuubutsushi (風物詩)', 'File Under UK Metaplasm', 'How can I', 'Yeo\u200b-\u200bNeun', 'Phantom Rhythm 幽靈節奏 (幽霊リズム)', 'Field Whispers (Into the Crystal Palace)', 'Apollo: Atmospheres & Soundtracks - Extended Edition', 'MAP OF THE SOUL : PERSONA', 'وين كنيت بي 11\u200b/\u200b9؟? “Where Were You on 9\u200b/\u200b11\u200b?\u200b” EP', 'SHuSH', 'Slide', 'Skinless X-1', '鬼島 Ghost Island EP', 'Love Yourself 轉 ‘Tear’', '起き上がり', 'August by Cake', 'Nordic Flora Series Pt. 3: Gore-Tex City', 'Last Place', '印象III : なんとなく、パブロ (Imagining “The Life of Pablo”)', 'Zelalem', 'Carta', 'Cemetery Highrise Slum', 'Square EP', 'Seeds', 'TANHÂ', 'Still Awake EP', 'Hi Beams', 'Zeros', 'High Time']
It appears that many of the remaining unknown characters occur within "artist" or "album" names, and these names naturally also appear within the body of the review. Fortunately, we are not directly embedding artist or album names (aside from their occurrences within the "review" text) when making predictions with our model, so we can move on.
Analysis prep¶
Let's prepare a summary of our dataset that will be useful for conducting some exploratory analysis before we fit our model.
Token counts¶
When we do exploratory analysis of data characteristics and modeling results, we may want to know the number of tokens that appeared in each review. Let's add that column to the data.
dataset = dataset.map(
lambda examples: dprep.get_n_tokens_batched(examples, "review", tokenizer),
batched=True,
num_proc=num_cores_avail
)
Map (num_proc=15): 0%| | 0/22063 [00:00<?, ? examples/s]
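The counting itself is simple; a hypothetical batched version of the helper (the actual one is get_n_tokens_batched in myutilpy.data_processing) could look like this:
# Hypothetical sketch of a batched token counter for a single text column
def count_tokens_batched(examples, column, tokenizer):
    # Tokenize the whole batch at once and record the length of each sequence
    return {
        f"{column}_n_tokens": [
            len(ids) for ids in tokenizer(examples[column]).input_ids
        ]
    }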
Collect summary features into dataframe¶
Let's compile the important columns for exploratory analysis into a summary dataset and convert it to a dataframe, summary_dataset_df, for subsequent analysis.
summary_dataset_df = (
dataset
.remove_columns(["year_released", "small_text", "album_art_url", "review"])
.to_pandas()
)
Split data¶
Finally, we want to prepare our data for model fitting by breaking it up into train, validation, and test sets.
Let's go with a 70-15-15 train-validation-test split.
- 70% for training is solid for fine-tuning.
- 15% each for validation and test gives reliably sized held-out sets for monitoring overfitting during training and for final evaluation.
- For a smaller dataset or a simpler model, a split with larger held-out fractions (e.g., 60-20-20) could be worth considering.
# First, split the dataset into "train" and "test" where "test" will be used to
# build the true "validation" and "test" splits
datasets = dataset.train_test_split(test_size=0.3, seed=data_seed)
# Now, split the temp dataset into validation and test sets
datasets_val_test = datasets.pop("test").train_test_split(test_size=0.5, seed=data_seed)
datasets["validation"] = datasets_val_test.pop("train")
datasets["test"] = datasets_val_test.pop("test")
Let's look at the outputs of splitting our dataset.
datasets
DatasetDict({
train: Dataset({
features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
num_rows: 15444
})
validation: Dataset({
features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
num_rows: 3309
})
test: Dataset({
features: ['artist', 'album', 'year_released', 'rating', 'small_text', 'review', 'reviewer', 'genre', 'label', 'reviewed', 'album_art_url', 'review_n_tokens'],
num_rows: 3310
})
})
Save out data¶
To wrap up, let's write the summary dataframe and the split dataset out to disk.
summary_dataset_df.to_csv(f"{root_dataset_dir}/summary_df.csv", index=False)
datasets.save_to_disk(f"{root_dataset_dir}/dataset")
Saving the dataset (0/1 shards): 0%| | 0/15444 [00:00<?, ? examples/s]
Saving the dataset (0/1 shards): 0%| | 0/3309 [00:00<?, ? examples/s]
Saving the dataset (0/1 shards): 0%| | 0/3310 [00:00<?, ? examples/s]
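In the next notebook, these artifacts can be read back in with datasets.load_from_disk and pandas; a minimal sketch, assuming the same root_dataset_dir, is shown below.
import pandas as pd
from datasets import load_from_disk

# Reload the saved splits and the summary dataframe
datasets = load_from_disk(f"{root_dataset_dir}/dataset")
summary_dataset_df = pd.read_csv(f"{root_dataset_dir}/summary_df.csv")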