| import numpy as np
|
| import os
|
| import urllib.request
|
| import zipfile
|
| import json
|
| import pandas as pd
|
| import time
|
| import torch
|
| import numpy as np
|
| import pandas as pd
|
| import torch.nn as nn
|
| import torch.nn.functional as F
|
| import torch.optim as optim
|
| from torch.utils.data import DataLoader, TensorDataset
|
| from sklearn.model_selection import train_test_split
|
| import matplotlib.pyplot as plt
|
| from sklearn.preprocessing import LabelEncoder
|
| import shutil
|
| import os
|
| import pyarrow.parquet as pq
|
|
|
def make_dir(directory):
    """Create ``directory`` as a fresh, empty directory.

    If the directory already exists it is deleted first, so downstream
    writes always start from a clean slate.

    Args:
        directory: path of the directory to (re)create.
    """
    # Original had makedirs in both branches of an if/else; only the
    # cleanup step is actually conditional.
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)
|
|
|
|
|
def read_parquet_folder(folder_path):
    """Read every ``*.parquet`` file in ``folder_path`` into one DataFrame.

    Files are read in sorted filename order so the concatenated result
    (and every id derived from row order downstream) is deterministic;
    ``os.listdir`` order is arbitrary and platform-dependent.

    Args:
        folder_path: directory containing the parquet shards.

    Returns:
        A single DataFrame with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: if the folder contains no ``.parquet`` files
            (the original code let ``pd.concat`` raise an opaque
            ``ValueError`` on the empty list).
    """
    dataframes = [
        pd.read_parquet(os.path.join(folder_path, file))
        for file in sorted(os.listdir(folder_path))
        if file.endswith('.parquet')
    ]
    if not dataframes:
        raise FileNotFoundError(f'no .parquet files found in {folder_path!r}')
    return pd.concat(dataframes, ignore_index=True)
|
|
|
|
|
def create_ids(df, col, name):
    """Assign a dense integer id to each distinct value of ``col``.

    Adds a ``{name}_id`` column to ``df`` (mutating it in place), writes
    the id/value lookup table to ``data/processed/{name}.csv`` under the
    current working directory, and returns the DataFrame.

    Ids follow first-appearance order of the values (0, 1, 2, ...).
    """
    mapping = {}
    for value in df[col].unique():
        mapping[value] = len(mapping)

    id_col = f'{name}_id'
    df[id_col] = df[col].map(mapping)

    lookup = df[[id_col, col]].drop_duplicates()
    lookup.to_csv(os.getcwd() + f'/data/processed/{name}.csv')

    return df
|
|
|
if __name__ == '__main__':
    # Load all raw parquet shards under data/raw/data into one frame.
    folder_path = os.getcwd() + '/data/raw/data'
    df = read_parquet_folder(folder_path)

    # Fresh output directory: previous processed artifacts are wiped.
    directory = os.getcwd() + '/data/processed'
    make_dir(directory)

    # Dense integer ids per entity; lookup CSVs land in data/processed/.
    df = create_ids(df, 'artist_name', 'artist')
    df = create_ids(df, 'pid', 'playlist')
    df = create_ids(df, 'album_name', 'album')

    # Distinct tracks per (playlist, artist, album) triple.
    df['song_count'] = df.groupby(['pid', 'artist_name', 'album_name'])['track_name'].transform('nunique')
    # 'pos' appears to be a 0-based track position, so max(pos) + 1 is the
    # playlist length — TODO confirm 'pos' has no gaps.
    df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')
    df['playlist_songs'] += 1

    # Composite "artist::album" key and a dense id for it.
    df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
    value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
    df['artist_album_id'] = df['artist_album'].map(value_to_id)

    # Lookup table mapping the composite id back to its components.
    # (Original strings had pointless f-prefixes with no placeholders.)
    df[['artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + '/data/processed/artist_album.csv')

    # Total track count per (playlist, artist_album) pair.
    df['song_count'] = df.groupby(['playlist_id', 'artist_album_id'])['song_count'].transform('sum')

    # Integer-encode track names (ids follow sorted-label order).
    encoder = LabelEncoder()
    encoder.fit(df['track_name'])

    df['track_id'] = encoder.transform(df['track_name'])
    # Fraction of the playlist covered by the pair, squashed through a
    # sigmoid.  NOTE(review): sigmoid of a value in (0, 1] only spans
    # (0.5, ~0.73] — confirm this scaling is intentional.
    df['song_percent'] = df['song_count'] / df['playlist_songs']
    df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))

    # Persist the distinct (playlist, artist_album) interaction pairs.
    artists = df.loc[:, ['playlist_id', 'artist_album_id', 'song_percent']].drop_duplicates()
    artists.loc[:, ['playlist_id', 'artist_album_id']].to_csv(os.getcwd() + '/data/processed/playlists.csv')
|
|
|