Skip to content

Commit

Permalink
Merge pull request #1 from Randul-Malinhara/add-initial-files
Browse files Browse the repository at this point in the history
Add the scripts, datasets
  • Loading branch information
Randul-Malinhara authored Dec 18, 2024
2 parents 4302742 + dd12192 commit bd86e92
Show file tree
Hide file tree
Showing 9 changed files with 286 additions and 0 deletions.
110 changes: 110 additions & 0 deletions 18098293_energyForecasting.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Import libraries
library(readxl)
library(neuralnet)
library(caTools)
library(MLmetrics)
library(Metrics)
library(xlsx)

# Build the Assumed Values and real Values table
# Put the expected values next to the network's predictions.
#
# assumed:  the expected (observed) target values; a vector or one-column table.
# mlp_test: a list-like object with a `net.result` element holding the
#           predictions (as returned by neuralnet::compute in this script).
# Returns a data frame with the columns "Expected Output" and
# "NeuralNetwork Output".
predictedVsreal <- function(assumed, mlp_test) {
  network_output <- as.data.frame(mlp_test$net.result)
  comparison <- cbind(assumed, network_output)
  colnames(comparison) <- c("Expected Output", "NeuralNetwork Output")
  comparison
}

# Evaluation Function
# Root mean squared error between observed and predicted values.
#
# real:      numeric vector of observed values.
# predicted: numeric vector of predictions (same length as `real`).
# Returns a single numeric value.
rmse <- function(real, predicted) {
  squared_error <- (real - predicted)^2
  sqrt(mean(squared_error))
}
# Compute the three error metrics used to compare the MLP configurations.
#
# real:    numeric vector of observed values.
# predict: numeric vector of model predictions (same length as `real`).
# Returns an unnamed numeric vector in the order c(RMSE, MAE, MAPE).
#
# NOTE(review): the input file name suggests the data are scaled; MAPE divides
# by the observed value and can become very large when those are near zero.
evaluationFunction <- function(real, predict) {
  rmse_mlp <- rmse(real = real, predicted = predict)
  # Metrics::mae() names its first argument `actual`; calling it with
  # `real = ...` stops with an "unused argument" error.
  mae_mlp <- Metrics::mae(actual = real, predicted = predict)
  mape_mlp <- MLmetrics::MAPE(y_pred = predict, y_true = real)
  c(rmse_mlp, mae_mlp, mape_mlp)
}

# Import the processed data set (an Excel sheet read into `powerOutage`).
# Later code expects the columns wineD_original_data, v2, ..., v6 (see the
# model formulas below).
powerOutage <- read_xlsx("C:/Users/RANDUL/Desktop/2nd Semester of 2nd Year/5DATA001C.2 Machine Learning and Data Mining/CW/scaled_powerusage_10.xlsx")

# Alternative inputs for the 9th and 10th hour.
# (Note: run the script once per input file, enabling one of the lines below.)
# NOTE(review): these file names differ from the "scaled_powerusage_<HH>.xlsx"
# pattern used by the active read_xlsx() call above - confirm they still exist.
# powerOutage <- read_xlsx("C:/Users/RANDUL/Desktop/2nd Semester of 2nd Year/5DATA001C.2 Machine Learning and Data Mining/CW/powerUsage-scaled-09.xlsx")
# powerOutage <- read_xlsx("C:/Users/RANDUL/Desktop/2nd Semester of 2nd Year/5DATA001C.2 Machine Learning and Data Mining/CW/powerUsage-scaled-10.xlsx")

# Split Data
# Fix the RNG seed so the random split (and everything that draws random
# numbers after it) is reproducible when the script is run top to bottom.
set.seed(123)
# Draw 430 distinct row indices for training; the remaining rows become the
# test set. sample() stops with an error if the table has fewer than 430 rows.
splitRule <- sample(seq_len(nrow(powerOutage)), size = 430)
train <- powerOutage[splitRule, ]
test <- powerOutage[-splitRule, ]

# Define X values of test data (every column except the first)
x_test <- test[-1]

# Define Y values of test data (the first column, kept as a one-column table)
y_test <- test[1]

# MLP NN - Configuration 1
# Inputs v2..v6, hidden = c(4,3), otherwise default training settings.
relation1 <- as.formula("wineD_original_data~v2+v3+v4+v5+v6")
# stepmax is raised to 1e+10 so training is not stopped by the step limit.
mlp1 <- neuralnet(formula = relation1, data = train,hidden = c(4,3), stepmax = 1e+10)
plot(mlp1)

# Make prediction on X test data
y_pred1 <- neuralnet::compute(mlp1, x_test)
# Pair expected and predicted values, then compute c(RMSE, MAE, MAPE).
evaluationData1 <- predictedVsreal(y_test, y_pred1)
mlpTestResult1 <- evaluationFunction(real = evaluationData1$`Expected Output`, predict = evaluationData1$`NeuralNetwork Output`)

# MLP NN - Configuration 2
# Same inputs and hidden = c(4,3) as configuration 1, plus learningrate = 0.001.
# NOTE(review): the neuralnet documentation describes `learningrate` as used
# only by traditional backpropagation (algorithm = "backprop"); no algorithm
# is set here, so this setting probably has no effect - confirm.
relation2 <- as.formula("wineD_original_data~v2+v3+v4+v5+v6")
mlp2 <- neuralnet(formula = relation2, data = train,hidden = c(4,3), stepmax = 1e+10, learningrate = 0.001)
plot(mlp2)

# Make prediction on X test data
y_pred2 <- neuralnet::compute(mlp2, x_test)
# Pair expected and predicted values, then compute c(RMSE, MAE, MAPE).
evaluationData2 <- predictedVsreal(y_test, y_pred2)
mlpTestResult2 <- evaluationFunction(real = evaluationData2$`Expected Output`, predict = evaluationData2$`NeuralNetwork Output`)

# MLP NN - Configuration 3
# Inputs v2..v6 with a smaller network, hidden = c(3,2); default training
# settings apart from the raised stepmax.
relation3 <- as.formula("wineD_original_data~v2+v3+v4+v5+v6")
mlp3 <- neuralnet(formula = relation3, data = train,hidden = c(3,2), stepmax = 1e+10)
plot(mlp3)

# Make prediction on X test data
y_pred3 <- neuralnet::compute(mlp3, x_test)
# Pair expected and predicted values, then compute c(RMSE, MAE, MAPE).
evaluationData3 <- predictedVsreal(y_test, y_pred3)
mlpTestResult3 <- evaluationFunction(real = evaluationData3$`Expected Output`, predict = evaluationData3$`NeuralNetwork Output`)

# MLP NN - Configuration 4
# Inputs v2..v6, hidden = c(5,2), learningrate = 0.001.
# NOTE(review): the neuralnet documentation describes `learningrate` as used
# only by traditional backpropagation (algorithm = "backprop"); no algorithm
# is set here, so this setting probably has no effect - confirm.
relation4 <- as.formula("wineD_original_data~v2+v3+v4+v5+v6")
mlp4 <- neuralnet(formula = relation4, data = train,hidden = c(5,2), stepmax = 1e+10, learningrate = 0.001)
plot(mlp4)

# Make prediction on X test data
y_pred4 <- neuralnet::compute(mlp4, x_test)
# Pair expected and predicted values, then compute c(RMSE, MAE, MAPE).
evaluationData4 <- predictedVsreal(y_test, y_pred4)
mlpTestResult4 <- evaluationFunction(real = evaluationData4$`Expected Output`, predict = evaluationData4$`NeuralNetwork Output`)

# MLP NN - Configuration 5
# Reduced input set: only v2, v3 and v4 are used as predictors, with
# hidden = c(4,3) and learningrate = 0.001.
# NOTE(review): `learningrate` is documented as used only by
# algorithm = "backprop", which is not set here - confirm it has any effect.
# Keep the first four columns of the training table for this model.
train0 <- train[1:4]
relation5 <- as.formula("wineD_original_data~v2+v3+v4")
mlp5 <- neuralnet(formula = relation5, data = train0,hidden = c(4,3), stepmax = 1e+10, learningrate = 0.001)
plot(mlp5)

# Test inputs restricted to columns 2 to 4, matching the three predictors above.
x_test0 <- test[2:4]

# Make prediction on X test data
# (Author's note: run configuration 1 first, then 2, 3, 4 and 5. The original
# note also pointed at lines "98:100"; NOTE(review): that line reference looks
# stale - confirm which statements were meant.)
y_pred5 <- neuralnet::compute(mlp5, x_test0)
# Pair expected and predicted values, then compute c(RMSE, MAE, MAPE).
evaluationData5 <- predictedVsreal(y_test, y_pred5)
mlpTestResult5 <- evaluationFunction(real = evaluationData5$`Expected Output`, predict = evaluationData5$`NeuralNetwork Output`)

# Stack the metric vectors of the five configurations into one matrix:
# one row per configuration, one column per metric.
comp <- rbind(
  mlpTestResult1,
  mlpTestResult2,
  mlpTestResult3,
  mlpTestResult4,
  mlpTestResult5
)
colnames(comp) <- c("RMSE", "MAE", "MAPE")
rownames(comp) <- c("config-1", "config-2", "config-3", "config-4", "config-5")
comp

# Save the comparison table.
# append = FALSE: with append = TRUE, write.xlsx() reopens an existing workbook
# and tries to add another sheet with the default name, which fails on every
# re-run once the file exists. Overwriting keeps the script re-runnable.
write.xlsx(
  comp,
  file = "C:/Users/RANDUL/Desktop/2nd Semester of 2nd Year/5DATA001C.2 Machine Learning and Data Mining/CW/comparison-powerusage.xlsx",
  col.names = TRUE,
  append = FALSE,
  row.names = TRUE
)
131 changes: 131 additions & 0 deletions 18098293_p_clustering.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# import libraries
library(readxl)
library(fpc)
library(NbClust)
library(dplyr)
library(MASS)
library(caret)
library(flexclust)


# Read the white wine data set (Whitewine_v2.xlsx) into `wineD` and draw one
# box plot per column as a first look at spread and outliers.
# NOTE(review): the file name here starts with a capital "W"; confirm it
# matches the file on disk if this is run on a case-sensitive file system.
wineD <- read_xlsx("C:/Users/RANDUL/Desktop/2nd Semester of 2nd Year/5DATA001C.2 Machine Learning and Data Mining/CW/Whitewine_v2.xlsx")
boxplot(wineD)

# Outlier removal
# Rows are screened on three columns (residual sugar, free sulfur dioxide and
# total sulfur dioxide) with the 1.5 * IQR rule. The fences for every column
# are computed from the unfiltered wineD data.

# Show the distribution summary of residual sugar for reference.
wineD_summary <- summary(wineD$`residual sugar`)
wineD_summary

# Lower and upper fences (Q1 - k * IQR, Q3 + k * IQR) of a numeric vector.
# Quartiles come straight from quantile() rather than from positions in the
# summary() output, so they do not depend on how summary() formats its values.
iqr_fences <- function(x, k = 1.5) {
  quartiles <- stats::quantile(
    x,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  interqr <- quartiles[2] - quartiles[1]
  c(lower = quartiles[1] - k * interqr, upper = quartiles[2] + k * interqr)
}

outlier_columns <- c(
  "residual sugar",
  "free sulfur dioxide",
  "total sulfur dioxide"
)

# inside_all:  row lies strictly inside the fences of every screened column.
# outside_any: row lies strictly outside the fences of at least one column.
inside_all <- rep(TRUE, nrow(wineD))
outside_any <- rep(FALSE, nrow(wineD))
for (column in outlier_columns) {
  values <- wineD[[column]]
  fences <- iqr_fences(values)
  inside_all <- inside_all &
    values > fences[["lower"]] &
    values < fences[["upper"]]
  outside_any <- outside_any |
    values < fences[["lower"]] |
    values > fences[["upper"]]
}

# Each outlying row is listed once, even when it is flagged by several columns.
# which() drops rows whose status is NA (missing value in a screened column).
outliers <- wineD[which(outside_any), ]

# The cleaned data is kept in a new data frame; wineD itself is left untouched.
no_outliers <- wineD[which(inside_all), ]

boxplot(no_outliers)



# Scaling
# Standardise every column except the 12th before clustering.
wine_stand <- scale(no_outliers[-12])
summary(wine_stand)

# NbClust(): candidate cluster counts from 2 to 8 using k-means
set.seed(1234)
nc <- NbClust(wine_stand, min.nc = 2, max.nc = 8, method = "kmeans")

# Bar chart of how often each cluster count appears in the first row of Best.n
barplot(
  table(nc$Best.n[1, ]),
  main = "Total Number of Clusters Chosen by 9 Criteria",
  xlab = "Total Number of Clusters",
  ylab = "Total Number of Criterias"
)

# Elbow curve: total within-cluster sum of squares for k = 1, ..., 9
ws <- vapply(
  1:9,
  function(k) sum(kmeans(wine_stand, centers = k)$withinss),
  numeric(1)
)

plot(
  1:9,
  ws,
  type = "b",
  xlab = "Total Number of Clusters",
  ylab = "Within groups sum of squares"
)

# For each k: fit k-means on the standardised data, cross-tabulate the
# quality column against the cluster labels, and draw a parallel-coordinates
# plot coloured by that fit's own cluster assignment.

# k means = 2
fit.km2 <- kmeans(wine_stand, 2)
plotcluster(wine_stand, fit.km2$cluster)
confuse <- table(no_outliers$quality, fit.km2$cluster)
confuse
parcoord(wine_stand, fit.km2$cluster)

# k means = 3
fit.km3 <- kmeans(wine_stand, 3)
fit.km3
confuse3 <- table(no_outliers$quality, fit.km3$cluster)
# Print the k = 3 table and colour by the k = 3 clusters (previously the
# k = 2 objects were reused here by copy-paste).
confuse3
parcoord(wine_stand, fit.km3$cluster)

# k means = 4
fit.km4 <- kmeans(wine_stand, 4)
confuse4 <- table(no_outliers$quality, fit.km4$cluster)
confuse4
parcoord(wine_stand, fit.km4$cluster)

# k means = 5
fit.km5 <- kmeans(wine_stand, 5)
confuse5 <- table(no_outliers$quality, fit.km5$cluster)
confuse5
parcoord(wine_stand, fit.km5$cluster)

plotcluster(wine_stand, fit.km5$cluster)

# Evaluation with ARI for k=2
randIndex(confuse)

# NbClust() again, this time with Manhattan distance, cluster counts 2 to 5
# and all available indices.
clusters_manhattan <- NbClust(
  wine_stand,
  distance = "manhattan",
  min.nc = 2,
  max.nc = 5,
  method = "kmeans",
  index = "all"
)
45 changes: 45 additions & 0 deletions 18098293_pre-process.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Import libraries
library(tidyverse)
library(readxl)
library(tidymodels)
library(readxl)
library(caTools)
library(xlsx)

# --------------- PREPARATION OF DATA BEFORE APPROACH --------------- #
# Load Data
power_usage <- read_excel("C:/Users/RANDUL/Desktop/2nd Semester of 2nd Year/5DATA001C.2 Machine Learning and Data Mining/CW/UoW_load.xlsx")
View(power_usage)

# Hour column to process: "09:00", "10:00" or "11:00".
# The output file name further down is derived from this single setting, so
# the saved file always matches the hour that was actually processed
# (previously the "11:00" column was written to a file named "..._09.xlsx").
target_hour <- "11:00"
power_usage <- power_usage[[target_hour]]
View(power_usage)
str(power_usage)
plot(power_usage)

# Build the Partial Auto Correlation plot
pacf(x = power_usage, plot = TRUE)

# Create Lags for Time Series
# dplyr::lag() is named explicitly: it shifts the values and pads the start
# with NA, whereas base stats::lag() does not shift the values of a plain
# vector. Without the prefix the result depends on package attach order.
lag1 <- dplyr::lag(power_usage, 1)
lag2 <- dplyr::lag(power_usage, 2)
lag3 <- dplyr::lag(power_usage, 3)
lag4 <- dplyr::lag(power_usage, 4)
lag5 <- dplyr::lag(power_usage, 5)
power_usage <- cbind(power_usage, lag1, lag2, lag3, lag4, lag5)

# Formatting power_usage
# Drop the leading rows that have no complete lag history, then confirm that
# no missing values remain.
power_usage <- na.omit(power_usage)
sum(is.na(power_usage))

# Column names expected by the forecasting script's model formulas:
# target first, then lag 1 to lag 5.
colnames(power_usage) <- c("wineD_original_data", "v2", "v3", "v4", "v5", "v6")

# Scaling Data
scaled_poweruse <- scale(power_usage)
str(scaled_poweruse)

# Save the processed data to an Excel file, one file per hour
# (scaled_powerusage_09.xlsx, _10.xlsx, _11.xlsx).
output_file <- paste0(
  "C:/Users/RANDUL/Desktop/2nd Semester of 2nd Year/5DATA001C.2 Machine Learning and Data Mining/CW/scaled_powerusage_",
  substr(target_hour, 1, 2),
  ".xlsx"
)
# append = FALSE: each hour has its own file, and appending to an existing
# workbook would try to add a second sheet with the default name, which fails
# when the script is re-run.
write.xlsx(
  scaled_poweruse,
  file = output_file,
  col.names = TRUE,
  append = FALSE,
  row.names = FALSE
)
Binary file added UoW_load.xlsx
Binary file not shown.
Binary file added comparison-powerusage.xlsx
Binary file not shown.
Binary file added scaled_powerusage_09.xlsx
Binary file not shown.
Binary file added scaled_powerusage_10.xlsx
Binary file not shown.
Binary file added scaled_powerusage_11.xlsx
Binary file not shown.
Binary file added whitewine_v2.xlsx
Binary file not shown.

0 comments on commit bd86e92

Please sign in to comment.