Commit e321569: "diffr done"
1 parent e7a26f0

13 files changed: +341 / -73 lines

DESCRIPTION

Lines changed: 1 addition & 0 deletions

@@ -39,6 +39,7 @@ Imports:
     data.table (>= 1.9.6),
     dtplyr (>= 0.0.1),
     Rcpp (>= 0.12.6),
+    stringdist (>= 0.9.4.1),
     stats,
     graphics
 Suggests:
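
For context: this new Imports entry is what the token alignment in R/diffr.R below runs on. A minimal illustration of the two stringdist calls the package uses, stringdist::amatch() for approximate matching against a table of candidates and stringdist::stringdist() for pairwise distances; the example strings are made up:

    library(stringdist)

    # index of the closest candidate within maxDist, NA if none is close enough
    amatch("stringdost", c("stringdist", "stringb"), method = "lv", maxDist = 2)
    #> [1] 1

    # pairwise Levenshtein distance
    stringdist("kitten", "sitting", method = "lv")
    #> [1] 3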

NAMESPACE

Lines changed: 1 addition & 0 deletions

@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(diffr)
 export(diffrproject)
 export(dp_text_base_data)
 export(moc_helper_easy_matches)

R/diffr.R

Lines changed: 153 additions & 0 deletions

@@ -0,0 +1,153 @@
+#' FUNCTION_TITLE
+#'
+#' FUNCTION_DESCRIPTION
+#'
+#' @param text1 first text
+#' @param text2 second text
+#' @param tokenizer defaults to NULL, which triggers linewise tokenization;
+#'   accepts a function that turns a text into a token data frame;
+#'   a token data frame has at least three columns:
+#'   from (first character of the token),
+#'   to (last character of the token),
+#'   token (the token itself)
+#' @param ignore defaults to NULL, which means that nothing is ignored;
+#'   accepts a function that takes a token data frame (see above) and returns a
+#'   possibly subsetted data frame of the same form
+#' @param clean defaults to NULL, which means that nothing is cleaned;
+#'   accepts a function that takes a vector of tokens and returns a
+#'   potentially cleaned-up vector of the same length
+#' @param distance defaults to Levenshtein ("lv"); see \link[stringdist]{amatch},
+#'   \link[stringdist]{stringdist-metrics}, \link[stringdist]{stringdist}
+#' @param ... further arguments passed through to the distance function
+#'
+#' @return data frame with tokens aligned according to distance
+#'
+#' @export
+diffr <- function(
+  text1 = NULL,
+  text2 = NULL,
+  tokenizer = NULL,
+  ignore = NULL,
+  clean = NULL,
+  distance = c("lv", "osa", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex"),
+  ...
+){
+  # checking input
+  if( is.function(distance) ){ stop("using non-standard distance functions is not implemented yet - sorry") }
+  stopifnot(
+    !is.null(text1),
+    is.character(text1),
+    !is.null(text2),
+    is.character(text2)
+  )
+
+  # assigning default options
+  if( is.null(tokenizer) ){ tokenizer <- stringb::text_tokenize_lines }
+  if( is.null(clean) ){ clean <- function(x){x} }
+  if( is.null(ignore) ){ ignore <- function(x){x} }
+  distance <- distance[1]
+
+  # tokenize
+  message(" - tokenizing text")
+  text1_tokenized <- tokenizer(text1)[1:3]
+  text1_tokenized$token_i <- seq_along(text1_tokenized$token)
+
+  text2_tokenized <- tokenizer(text2)[1:3]
+  text2_tokenized$token_i <- seq_along(text2_tokenized$token)
+
+  # clean
+  message(" - cleaning tokens")
+  text1_tokenized$token <- clean(text1_tokenized$token)
+  text2_tokenized$token <- clean(text2_tokenized$token)
+
+  # ignore
+  message(" - ignoring tokens")
+  text1_tokenized_prei <- text1_tokenized
+  text2_tokenized_prei <- text2_tokenized
+  text1_tokenized <- ignore(text1_tokenized)
+  text2_tokenized <- ignore(text2_tokenized)
+
+  # alignment and distances
+  message(" - doing distance calculation and alignment")
+
+  text1_tokenized <- setNames(text1_tokenized, c("from_1", "to_1", "token_1", "token_i_1"))
+  text2_tokenized <- setNames(text2_tokenized, c("from_2", "to_2", "token_2", "token_i_2"))
+
+  # distance
+  a <-
+    stringdist::amatch(
+      text1_tokenized$token_1,
+      text2_tokenized$token_2,
+      method = distance,
+      ...
+    )
+
+  # alignment
+  alignment <-
+    data.frame(
+      text1_tokenized,
+      text2_tokenized[a, ]
+    )
+
+  alignment$distance <-
+    stringdist::stringdist(
+      alignment$token_1,
+      alignment$token_2,
+      method = distance
+    )
+
+  # type and distances
+  alignment$type <- ""
+  alignment$type[alignment$distance == 0] <- "no-change"
+  alignment$type[alignment$distance >  0] <- "change"
+
+  iffer <- is.na(alignment$token_1)
+  alignment[iffer, "type"] <- "insertion"
+  alignment[iffer, "distance"] <- stringdist::stringdist("", alignment[iffer, "token_2"])
+
+  iffer <- is.na(alignment$token_2)
+  alignment[iffer, "type"] <- "deletion"
+  alignment[iffer, "distance"] <- stringdist::stringdist("", alignment[iffer, "token_1"])
+
+  # non-matches
+  tmp <-
+    subset(
+      cbind(text1_tokenized, type = "ignored"),
+      !(text1_tokenized$token_i_1 %in% alignment$token_i_1)
+    )
+  alignment <-
+    rtext:::rbind_fill(alignment, tmp)
+
+  tmp <-
+    subset(
+      cbind(text2_tokenized, type = "ignored"),
+      !(text2_tokenized$token_i_2 %in% alignment$token_i_2)
+    )
+  alignment <-
+    rtext:::rbind_fill(alignment, tmp)
+
+  # return
+  return(alignment)
+}
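
A usage sketch of the contracts documented above. The input strings are made up; stringb::text_tokenize_words is assumed to honour the tokenizer contract (a data frame with from, to, token columns), which matches how dev.R below uses it:

    library(diffrprojects)

    a <- "the quick brown fox\njumps over the lazy dog"
    b <- "the quick red fox\njumps over a lazy cat"

    # defaults: linewise tokenization, Levenshtein ("lv")
    alignment <- diffr(a, b)
    table(alignment$type)

    # custom contracts: word tokens, lower-cased, one-character tokens dropped
    alignment <-
      diffr(
        a, b,
        tokenizer = stringb::text_tokenize_words,
        clean     = tolower,
        ignore    = function(tt){ tt[nchar(tt$token) > 1, ] },
        distance  = "osa"
      )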

R/moc.R

Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+#' stub
+#' @keywords internal
+moc <- function(
+  text1 = NULL,
+  text2 = NULL,
+  tokenizer = function(text){text_tokenize_lines(text)},
+  ignore = function(...){FALSE},
+  clean = function(token){token},
+  distance = function(token1, token2){matrix(0, nrow = length(token1), ncol = length(token2))},
+  alignment = function(m){}
+){
+  # alignment and distances
+
+  #### trivial matches -- unique equal token matches
+  message(" - trivial matching")
+  res <-
+    moc_helper_trivial_matches( tt1 = text1_tokenized, tt2 = text2_tokenized )
+
+  #### easy matches -- text1 non-unique equal token matches
+  message(" - easy matching 1")
+  res <-
+    rbind(
+      res,
+      moc_helper_easy_matches( tt1 = text1_tokenized, tt2 = text2_tokenized, res = res, type = 1 )
+    )
+
+  #### easy matches -- text2 non-unique equal token matches
+  message(" - easy matching 2")
+  res <-
+    rbind(
+      res,
+      moc_helper_easy_matches( tt1 = text1_tokenized, tt2 = text2_tokenized, res = res, type = 2 )
+    )
+
+  #### easy matches -- remaining equal token matches, bucketed by token length
+  message(" - easy matching 3")
+
+  # prepare tt1 and tt2, then split them into lists of data.frames by token length
+  tt1 <-
+    text1_tokenized %>%
+    filter( !(token_i %in% res$token_i_1) )
+
+  tt2 <-
+    text2_tokenized %>%
+    filter( !(token_i %in% res$token_i_2) )
+
+  tt1_split <- split_tt_by_length(tt1)
+  tt2_split <- split_tt_by_length(tt2)
+
+  tt_names <- unique(c(names(tt1_split), names(tt2_split)))
+
+  # do the matches
+  for( i in rev(seq_along(tt_names)) ){
+    cat(i, " ")
+    res <-
+      moc_helper_easy_matches(
+        tt1 = tt1_split[[tt_names[i]]],
+        tt2 = tt2_split[[tt_names[i]]],
+        res = res,
+        type = 3
+      )
+  }
+  cat("\n")
+
+  # finish matching of no-change type
+  res$type <- "no-change"
+  res$diff <- 0
+}
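
moc() is a stub and moc_helper_trivial_matches() is not part of this commit's diff, but the first pass it names, pairing tokens that are equal and occur exactly once in each text, can be sketched as follows. This is a hypothetical re-implementation for illustration, not the package's code:

    # sketch: pair tokens that are unique within each text and identical across texts
    trivial_matches_sketch <- function(tt1, tt2){
      u1 <- tt1[tt1$token %in% names(which(table(tt1$token) == 1)), ]
      u2 <- tt2[tt2$token %in% names(which(table(tt2$token) == 1)), ]
      # merge on the token itself; suffixes yield token_i_1 / token_i_2 style names
      merge(u1, u2, by = "token", suffixes = c("_1", "_2"))
    }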

R/moc_helper.R

Lines changed: 27 additions & 2 deletions

@@ -1,6 +1,31 @@
+#' splitting a tokenized text
+#' @param tt tokenized text
+#' @keywords internal
+split_tt_by_length <- function(tt){
+  tt %>%
+    dplyr::mutate(
+      token_length = nchar(token)
+    ) %>%
+    split(
+      .$token_length
+    ) %>%
+    lapply(
+      dplyr::mutate,
+      token_length = NULL
+    ) %>%
+    lapply(
+      as.data.table
+    ) %>%
+    lapply(
+      setkey,
+      token, token_i
+    )
+}
+
 #' trivial matches
 #'
-#' merthod of comparison helper function
+#' method of comparison helper function
 #' @param tt1 tokenized text number 1
 #' @param tt2 tokenized text number 2
 #' @export

@@ -89,7 +114,7 @@ moc_helper_easy_matches <- function(tt1, tt2, res, type=c(1,2), fullreturn=TRUE)
 
   # return
   if( fullreturn ){
-    return(rbind(res,chosen))
+    return( rbind(res, data.table(chosen), fill=TRUE) )
   }else{
     return(chosen)
   }
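
What split_tt_by_length() produces is a list of keyed data.tables, one per token length, which is what the length-bucketed matching in R/moc.R iterates over. A toy run (the function is internal, hence the ::: access; the from/to values are made up):

    library(dplyr)
    library(data.table)

    tt <- data.frame(
      from    = c(1, 3, 6, 9),
      to      = c(1, 4, 7, 12),
      token   = c("a", "bb", "cc", "dddd"),
      token_i = 1:4,
      stringsAsFactors = FALSE
    )

    diffrprojects:::split_tt_by_length(tt)
    # list with elements $`1` (token "a"), $`2` ("bb", "cc"), $`4` ("dddd"),
    # each a data.table keyed by token, token_i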

dev.R

Lines changed: 3 additions & 70 deletions

@@ -1,89 +1,22 @@
 #### ---------------------------------------------------------------------------
 
 library(diffrprojects)
-is_unique <- diffrprojects:::is_unique
-is_minimum <- diffrprojects:::is_minimum
-dim1 <- diffrprojects:::dim1
-which_dist_min_absolute <- diffrprojects:::which_dist_min_absolute
-choose_options <- diffrprojects:::choose_options
-split_tt_by_length <- diffrprojects:::split_tt_by_length
-
-
-library(dplyr)
-library(data.table)
-library(dtplyr)
-library(Rcpp)
-
-
-
-
 
 #### ---------------------------------------------------------------------------
 
 text_path <- "~/Dropbox/IDEP_Database/rawdata/AUT/txts"
 
 text_files <- list.files(text_path, pattern = "txt", full.names = TRUE)
 
-text1 <- rtext$new(text_file=text_files[13], encoding="latin1")$text_get()
-text2 <- rtext$new(text_file=text_files[14], encoding="latin1")$text_get()
+text1 <- rtext$new(text_file=text_files[13], encoding="latin1")$text_get(2000)
+text2 <- rtext$new(text_file=text_files[14], encoding="latin1")$text_get(2000)
 
 #text1 <- rtext$new(text_file=stringb:::test_file("rc_2.txt"))$text_get()
 #text2 <- rtext$new(text_file=stringb:::test_file("rc_3.txt"))$text_get()
 
-tokenizer <- text_tokenize_words
-ignore = function(...){FALSE}
-clean = function(token){token}
-distance = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex")
-
 #### ---------------------------------------------------------------------------
 
-
-diffr <- function(
-  text1 = NULL,
-  text2 = NULL,
-  tokenizer = function(text){text_tokenize_lines(text)},
-  ignore = NULL,
-  clean = NULL,
-  distance = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex")
-){}
-
-# tokenize
-message(" - tokenizing text")
-text1_tokenized <- tokenizer(text1)[1:3]
-text1_tokenized$token_i <- seq_along(text1_tokenized$token)
-
-text2_tokenized <- tokenizer(text2)[1:3]
-text2_tokenized$token_i <- seq_along(text2_tokenized$token)
-
-# clean
-if( !is.null(clean) ){
-  message(" - cleaning token")
-  text1_tokenized$token <- clean(text1_tokenized$token)
-  text2_tokenized$token <- clean(text2_tokenized$token)
-}
-
-
-# ignore
-if( !is.null(ignore) ){
-  message(" - ignoring token")
-  text1_tokenized <- ignore(text1_tokenized)
-  text2_tokenized <- ignore(text2_tokenized)
-}
-
-
-# alignment and distances
-if( is.character(distance) ){
-  message(" - doing distance calculation and alignment")
-  a <- stringdist::amatch(text1_tokenized$token, text2_tokenized$token, method=distance)
-  alignment <- data.frame(text1_tokenized, text2_tokenized[a, ])
-}else{
-  stop("using non standard distance functions is not implemented yet - sorry")
-}
-
-
-
-
-
+diffr(text1, text2)
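
Given the alignment data frame that diffr() returns (see R/diffr.R above), the call at the end of the script can be followed by a quick sanity check; a sketch, assuming the run succeeds on the local files:

    alignment <- diffr(text1, text2)

    # distribution of edit types across aligned tokens
    table(alignment$type)

    # the most heavily edited token pairs first
    head(alignment[order(-alignment$distance), c("token_1", "token_2", "distance", "type")])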

dev.cpp

Whitespace-only changes.

dev_save.R

Lines changed: 10 additions & 0 deletions

@@ -149,6 +149,16 @@ moc <- function(
 
 
 
+# alignment via Hungarian solution to the assignment problem
+
+library(clue)
+
+m <- adist(tt1$token, tt2$token)
+solution_index_v <- as.numeric(solve_LSAP(m))
+solution_index_m <- as.matrix(cbind(seq_along(solution_index_v), solution_index_v))
+
+aligned <- cbind(tt1, tt2[solution_index_v, ], dist = m[solution_index_m])
+
 
 
 
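
clue::solve_LSAP() solves the linear sum assignment problem (the Hungarian method) over the Levenshtein matrix that base R's adist() returns, so every token of tt1 gets exactly one partner in tt2 at minimal total distance. A self-contained toy run with made-up tokens; note that solve_LSAP() requires nrow(m) <= ncol(m):

    library(clue)

    t1 <- c("alpha", "beta", "gamma")
    t2 <- c("alpah", "gamma", "betta", "delta")

    m   <- adist(t1, t2)              # 3 x 4 Levenshtein distance matrix
    sol <- as.integer(solve_LSAP(m))  # one column index per row, minimal total cost

    data.frame(
      token_1 = t1,
      token_2 = t2[sol],
      dist    = m[cbind(seq_along(sol), sol)]
    )
    #>   token_1 token_2 dist
    #> 1   alpha   alpah    2
    #> 2    beta   betta    1
    #> 3   gamma   gamma    0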
