Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
""" Returns the taste profile subset from the million song dataset:
https://labrosa.ee.columbia.edu/millionsong/tasteprofile
Data returned is a tuple of (trackinfo, user, plays) where
plays is a CSR matrix with the rows being the track, columns being
the user and the values being the number of plays.
Trackinfo is an array of tuples (trackid, artist, album, song name),
with the position corresponding to the rowid of the plays matrix. Likewise
users is an array of the user identifiers.
"""
filename = os.path.join(_download.LOCAL_CACHE_DIR, "msd_taste_profile.hdf5")
if not os.path.isfile(filename):
log.info("Downloading dataset to '%s'", filename)
_download.download_file(URL, filename)
else:
log.info("Using cached dataset at '%s'", filename)
with h5py.File(filename, 'r') as f:
m = f.get('track_user_plays')
plays = csr_matrix((m.get('data'), m.get('indices'), m.get('indptr')))
return np.array(f['track']), np.array(f['user']), plays
'1m' or '100k'.
Returns
-------
movies : ndarray
An array of the movie titles.
ratings : csr_matrix
A sparse matrix where the row is the movieId, the column is the userId and the value is
the rating.
"""
filename = "movielens_%s.hdf5" % variant
path = os.path.join(_download.LOCAL_CACHE_DIR, filename)
if not os.path.isfile(path):
log.info("Downloading dataset to '%s'", path)
_download.download_file(URL_BASE + filename, path)
else:
log.info("Using cached dataset at '%s'", path)
with h5py.File(path, 'r') as f:
m = f.get('movie_user_ratings')
plays = csr_matrix((m.get('data'), m.get('indices'), m.get('indptr')))
return np.array(f['movie']), plays
def get_lastfm():
    """Load the lastfm360k dataset, fetching it into the local cache first if needed.

    The HDF5 file is looked up under ``_download.LOCAL_CACHE_DIR``; when it is
    not present there it is downloaded from ``URL`` before being opened.

    Returns
    -------
    tuple
        ``(artists, users, plays)`` where ``artists`` and ``users`` are numpy
        arrays read from the ``artist`` and ``user`` datasets, and ``plays`` is
        a CSR matrix assembled from the ``data``/``indices``/``indptr`` entries
        stored under ``artist_user_plays``.
    """
    cache_path = os.path.join(_download.LOCAL_CACHE_DIR, "lastfm_360k.hdf5")

    if os.path.isfile(cache_path):
        log.info("Using cached dataset at '%s'", cache_path)
    else:
        log.info("Downloading dataset to '%s'", cache_path)
        _download.download_file(URL, cache_path)

    with h5py.File(cache_path, "r") as hdf:
        group = hdf.get("artist_user_plays")

        # Everything must be materialised while the file is still open: the
        # HDF5 handles stop being readable once the ``with`` block exits.
        csr_parts = (group.get("data"), group.get("indices"), group.get("indptr"))
        plays = csr_matrix(csr_parts)

        artists = np.array(hdf["artist"])
        users = np.array(hdf["user"])

    return artists, users, plays