| import os
|
| import urllib.request
|
| import zipfile
|
| import json
|
| import pandas as pd
|
| import time
|
| import torch
|
| import numpy as np
|
| import pandas as pd
|
| import torch.nn as nn
|
| import torch.nn.functional as F
|
| import torch.optim as optim
|
| from torch.utils.data import DataLoader, TensorDataset
|
| from sklearn.model_selection import train_test_split
|
| import matplotlib.pyplot as plt
|
| from sklearn.preprocessing import LabelEncoder
|
| import shutil
|
| import os
|
| import pyarrow.parquet as pq
|
|
|
|
|
# Columns kept after flattening each playlist's nested 'tracks' list:
# playlist-level fields (name, pid, num_followers) plus per-track fields
# (pos = track position within the playlist, artist/track/album names).
# Consumed by make_dataset() to project the merged DataFrame.
cols = [
    'name',
    'pid',
    'num_followers',
    'pos',
    'artist_name',
    'track_name',
    'album_name'
]
|
|
|
|
|
def copy_file(src, dst):
    """Copy *src* to *dst* (metadata preserved), creating dst's parent dirs.

    Args:
        src: Path of the file to copy.
        dst: Destination file path; any missing parent directories are created.
    """
    dst_dir = os.path.dirname(dst)
    # dirname is '' when dst is a bare filename; os.makedirs('') would raise.
    if dst_dir:
        # exist_ok avoids the check-then-create race of the exists() idiom.
        os.makedirs(dst_dir, exist_ok=True)
    # copy2 preserves file metadata (mtime, permissions) in addition to content.
    shutil.copy2(src, dst)
|
|
|
def unzip_archive(filepath, dir_path):
    """Extract the zip archive at *filepath* into directory *dir_path*.

    Args:
        filepath: Path to a .zip archive.
        dir_path: Target directory (created by extractall if missing).
    """
    # filepath is already a string; the original f"{filepath}" wrapper was redundant.
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(dir_path)
|
|
|
|
|
def make_dir(directory):
    """(Re)create *directory*, deleting any existing contents first.

    Args:
        directory: Path to create; if it already exists it is removed
            recursively so the result is always a fresh, empty directory.
    """
    # Both original branches ended in makedirs; only the rmtree is conditional.
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)
|
|
|
|
|
def make_dataset():
    """Flatten raw playlist JSON slices into parquet chunks.

    Reads each ``.json`` slice in ``data/raw/playlists/data``, explodes the
    nested ``tracks`` list into one row per (playlist, track), keeps only
    the columns in ``cols``, and writes a parquet chunk to ``data/raw/data``
    after every 50 slices. Processing stops after 200 slices — a sample of
    the full 1000-slice dump (the progress line still shows /1000).
    """
    directory = os.getcwd() + '/data/raw/playlists/data'
    df = pd.DataFrame()
    index = 0

    for filename in os.listdir(directory):
        full_path = os.path.join(directory, filename)
        # endswith is stricter than the original find(): it skips names
        # like 'x.json.bak' that merely contain '.json'.
        if not (os.path.isfile(full_path) and filename.endswith('.json')):
            continue

        index += 1
        print(f'\r(unknown)\t{index}/1000\t{((index/1000)*100):.1f}%', end='')

        with open(full_path, 'r') as file:
            json_data = json.load(file)

        temp = pd.DataFrame(json_data['playlists'])
        # One row per track instead of one row per playlist.
        expanded_df = temp.explode('tracks').reset_index(drop=True)
        # Each 'tracks' cell is a dict; normalize it into flat columns.
        json_normalized = pd.json_normalize(expanded_df['tracks'])
        result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
        result = result[cols]

        df = pd.concat([df, result], axis=0, ignore_index=True)

        if index % 50 == 0:
            # Plain `index` in the name: the original's `index % 1000` is
            # identical for the <=200 slices processed here, but would
            # silently overwrite playlists_0 at slice 1000.
            df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index}.parquet')
            df = pd.DataFrame()
        if index % 200 == 0:
            break

    # Flush rows accumulated since the last checkpoint; the original
    # silently dropped them when the slice count wasn't a multiple of 50.
    if not df.empty:
        df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index}.parquet')
|
|
|
|
|
if __name__ == '__main__':
    # Pipeline entry point: unpack the raw archive, reset the output
    # directories, then flatten the playlists into parquet chunks.
    cwd = os.getcwd()
    unzip_archive(
        cwd + '/data/raw/spotify_million_playlist_dataset.zip',
        cwd + '/data/raw/playlists',
    )
    for subdir in ('/data/raw/data', '/data/processed'):
        make_dir(cwd + subdir)
    make_dataset()
|
|
|
|
|