Skip to content

Commit

Permalink
Create ml10m.R
Browse files Browse the repository at this point in the history
  • Loading branch information
statwangz committed Sep 20, 2023
1 parent 0a34eb1 commit 80bd570
Showing 1 changed file with 81 additions and 0 deletions.
81 changes: 81 additions & 0 deletions data-raw/ml10m.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
### Code to prepare `ml10m` dataset goes here

library(Matrix)
library(stringr)

raw_data <- tempfile()
download.file(
"https://files.grouplens.org/datasets/movielens/ml-10m.zip", raw_data
)

## Movie rating data

# UserID::MovieID::Rating::Timestamp
ratings <- read.table(
text = readLines(unzip(raw_data, "ml-10M100K/ratings.dat")),
sep = ":", header = FALSE,
colClasses = c(
"integer", "NULL", "integer", "NULL",
"numeric", "NULL", "integer"
)
)
# head(ratings)
# dim(ratings)
all_UserID <- paste0("user_", sort(unique(ratings[, 1])))
all_MovieID <- paste0("movie_", sort(unique(ratings[, 2])))
ratings[, 1] <- paste0("user_", ratings[, 1])
ratings[, 2] <- paste0("movie_", ratings[, 2])

N <- length(all_UserID)
M <- length(all_MovieID)

rating_matrix <- Matrix::sparseMatrix(
i = match(ratings[, 1], all_UserID), # Row indices
j = match(ratings[, 2], all_MovieID), # Column indices
x = ratings[, 3],
dims = c(N, M),
symmetric = FALSE, triangular = FALSE,
index1 = TRUE
)
# head(rating_matrix)

## Movies file description

# MovieID::Title::Genres
movies <- stringr::str_split_fixed(
readLines(unzip(raw_data, "ml-10M100K/movies.dat")),
pattern = "::", n = 3
)
# head(movies)
movies[, 1] <- paste0("movie_", movies[, 1])
movies <- movies[movies[, 1] %in% all_MovieID, ]

all_genres <- c(
"Action", "Adventure", "Animation", "Children's",
"Comedy", "Crime", "Documentary", "Drama",
"Fantasy", "Film-Noir", "Horror", "Musical",
"Mystery", "Romance", "Sci-Fi", "Thriller",
"War", "Western"
)

# Create the movie-genre matrix for rows (# of movies) and columns (# of genres)
movie_genre_matrix <- t(sapply(
movies[, 3],
FUN = function(x) {
as.integer(all_genres %in%
unlist(strsplit(x, split = "|", fixed = TRUE))
)
},
USE.NAMES = FALSE
))
colnames(movie_genre_matrix) <- all_genres

ml10m <- list(
rating = rating_matrix,
genre = movie_genre_matrix
)

# Save the compressed data
usethis::use_data(ml10m, overwrite = TRUE, compress = "xz")
# # Compress the data
# tools::resaveRdaFiles(paths = "data/ml10m.rda")

0 comments on commit 80bd570

Please sign in to comment.