logic diary: data consistency test via R Studio
Logic to verify data coherence between Views from Data Lake that are 'In progress' and data extracted from SAP QAS
as a big fan of R, I decided to use it to verify the data consistency on R Studio this time.
solution!
- read the CSV files from both sources using the - read.csv2function suitable for files with semicolon-separated values.
- removed duplicate values from each column in both files and exported the unique values to new CSV files. This ensured we worked with clean and unique data for each column. 
- for each column, I compared the unique values from the view and sap files. I used the - setdifffunction to identify values in the SAP file but not in the View file.
- remain values were exported to new CSV files with the suffix - _diff.csv. If no differences were found, a message was displayed in the console indicating that: “no different values were found”.
# Set working directory
setwd("")
# Read the vw file
vw_OTs <- read.csv2("view.csv", header=TRUE, stringsAsFactors=FALSE, fileEncoding="latin1")
head(vw_OTs)
# Read the sap file
sapdata <- read.csv2("sap.csv", header=TRUE, stringsAsFactors=FALSE, fileEncoding="latin1")
head(sapdata)
# Initialize lists to store file names
viewfiles <- c()
sapfiles <- c()
# Remove duplicates and export a CSV for each column (vw)
for (column in names(vw_OTs)) {
  unique_values <- unique(vw_OTs[[column]])
  df_unique_values <- data.frame(unique_values)
  viewfile_name <- paste0(column, "_unique.csv")
  write.csv(df_unique_values, viewfile_name, row.names = FALSE)
  viewfiles <- c(viewfiles, viewfile_name)
}
# Remove duplicates and export a CSV for each column (sap)
for (column in names(sapdata)) {
  unique_values <- unique(sapdata[[column]])
  df_unique_values <- data.frame(unique_values)
  sapfile_name <- paste0(column, "_unique.csv")
  write.csv(df_unique_values, sapfile_name, row.names = FALSE)
  sapfiles <- c(sapfiles, sapfile_name)
}
column_names <- names(vw_OTs)
# Function to compare columns of two CSV files and return unique values
compare_csv_columns <- function(sapfile, viewfile, column_name) {
  # Read the CSV files
  df1 <- read.csv(sapfile, header=TRUE, stringsAsFactors = FALSE)
  df2 <- read.csv(viewfile, header=TRUE, stringsAsFactors = FALSE)
  
  # Get unique values from the specified column in each file
  unique_values_file1 <- unique(df1[[column_name]])
  unique_values_file2 <- unique(df2[[column_name]])
  
  # Find values that are in sap file but not in view file
  diff_values <- setdiff(unique_values_file1, unique_values_file2)
  
  if (length(diff_values) == 0) {
    cat("No different values found for column:", column_name, "\n")
  } else {
    # Convert the result to a data frame
    diff_df <- data.frame(diff_values)
    
    # Save the result to a new CSV file
    write.csv(diff_df, paste0(column_name, "_diff.csv"), row.names = FALSE)
  }
}
# Compare the columns of both files
for (i in 1:length(column_names)) {
  compare_csv_columns(sapfiles[i], viewfiles[i], column_names[i])
}be nice!
i would love to hear your thoughts on this logic! or feel free to join the discussion around this topic!
also check my github!

