-#' FUNCTION_TITLE
+#' aligning texts
 #'
-#' FUNCTION_DESCRIPTION
+#' Aligns two texts side by side in a data.frame that also reports the change
+#' type and distance for each pair of aligned tokens.
 #'
 #' @param text1 first text
 #' @param text2 second text
 #' @param distance defaults to Levenshtein ("lv"); see \link[stringdist]{amatch},
 #'   \link[stringdist]{stringdist-metrics}, \link[stringdist]{stringdist}
 #' @param ... further arguments passed through to distance function
+#' @inheritParams stringdist::stringdist
 #'
 #' @return dataframe with tokens aligned according to distance
 #'
 #' @export
-diffr <- function(
+diff_align <- function(
   text1 = NULL,
   text2 = NULL,
   tokenizer = NULL,
   ignore = NULL,
   clean = NULL,
   distance = c("lv", "osa", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex"),
+  useBytes = FALSE,
+  weight = c(d = 1, i = 1, s = 1, t = 1),
+  maxDist = Inf,
+  q = 1,
+  p = 0,
+  nthread = getOption("sd_num_thread"),
   ...
 ){
   # checking input
@@ -45,6 +53,8 @@ diffr <- function(
   if ( is.null(tokenizer) ){ tokenizer <- stringb::text_tokenize_lines }
   if ( is.null(clean) ){ clean <- function(x){x} }
   if ( is.null(ignore) ){ ignore <- function(x){x} }
+  if ( length(text1) > 1 ){ text1 <- text_collapse(text1) }
+  if ( length(text2) > 1 ){ text2 <- text_collapse(text2) }
   distance <- distance[1]
 
   # tokenize
@@ -67,19 +77,27 @@ diffr <- function(
   text1_tokenized <- ignore(text1_tokenized)
   text2_tokenized <- ignore(text2_tokenized)
 
+  # column naming
+  text1_tokenized_prei <- stats::setNames(text1_tokenized_prei, c("from_1", "to_1", "token_1", "token_i_1"))
+  text2_tokenized_prei <- stats::setNames(text2_tokenized_prei, c("from_2", "to_2", "token_2", "token_i_2"))
+  text1_tokenized <- stats::setNames(text1_tokenized, c("from_1", "to_1", "token_1", "token_i_1"))
+  text2_tokenized <- stats::setNames(text2_tokenized, c("from_2", "to_2", "token_2", "token_i_2"))
+
   # alignment and distances
   message("- doing distance calculation and alignment")
 
-  text1_tokenized <- setNames(text1_tokenized, c("from_1", "to_1", "token_1", "token_i_1"))
-  text2_tokenized <- setNames(text2_tokenized, c("from_2", "to_2", "token_2", "token_2_1"))
-
   # distance
   a <-
     stringdist::amatch(
       text1_tokenized$token_1,
       text2_tokenized$token_2,
       method = distance,
-      ...
+      useBytes = useBytes,
+      weight = weight,
+      maxDist = maxDist,
+      q = q,
+      p = p,
+      nthread = nthread
     )
 
   # alignment
@@ -93,40 +111,113 @@ diffr <- function(
     stringdist::stringdist(
       alignment$token_1,
       alignment$token_2,
-      method = distance
+      method = distance,
+      useBytes = useBytes,
+      weight = weight,
+      q = q,
+      p = p,
+      nthread = nthread
     )
 
   # type and distances
-  alignment$type <- ""
-  alignment$type[alignment$distance == 0] <- "no-change"
-  alignment$type[alignment$distance > 0] <- "change"
+  if ( dim1(alignment) > 0 ){
+  alignment$type <- ""
+  alignment$type[alignment$distance == 0] <- "no-change"
+  alignment$type[alignment$distance > 0] <- "change"
 
-  iffer <- is.na(alignment$token_1)
-  alignment[iffer, "type"] <- "insertion"
-  alignment[iffer, "distance"] <- stringdist::stringdist("", alignment[iffer, "token_2"])
+  alignment <-
+    rtext:::rbind_fill(
+      alignment,
+      text1_tokenized[
+        !(text1_tokenized$token_i_1 %in% alignment$token_i_1),
+      ]
+    )
+
+  alignment <-
+    rtext:::rbind_fill(
+      alignment,
+      text2_tokenized[
+        !(text2_tokenized$token_i_2 %in% alignment$token_i_2),
+      ]
+    )
 
   iffer <- is.na(alignment$token_2)
   alignment[iffer, "type"] <- "deletion"
-  alignment[iffer, "distance"] <- stringdist::stringdist("", alignment[iffer, "token_1"])
+  alignment[iffer, "distance"] <-
+    stringdist::stringdist(
+      "",
+      alignment[iffer, "token_1"],
+      method = distance,
+      useBytes = useBytes,
+      weight = weight,
+      q = q,
+      p = p,
+      nthread = nthread
+    )
 
-  # non matches
-  tmp <-
-    subset(
-      cbind(text1_tokenized, type = "ignored"),
-      !(text1_tokenized$token_i_1 %in% alignment$token_i_1)
+  iffer <- is.na(alignment$token_1)
+  alignment[iffer, "type"] <- "insertion"
+  alignment[iffer, "distance"] <-
+    stringdist::stringdist(
+      "",
+      alignment[iffer, "token_2"],
+      method = distance,
+      useBytes = useBytes,
+      weight = weight,
+      q = q,
+      p = p,
+      nthread = nthread
     )
-  alignment <-
-    rtext:::rbind_fill(alignment, tmp)
 
+  alignment$token_1 <-
+    dplyr::left_join(
+      subset(alignment, TRUE, token_i_1),
+      subset(text1_tokenized_prei, TRUE, c(token_i_1, token_1)),
+      by = c("token_i_1" = "token_i_1")
+    )$token_1
+
+  alignment$token_2 <-
+    dplyr::left_join(
+      subset(alignment, TRUE, token_i_2),
+      subset(text2_tokenized_prei, TRUE, c(token_i_2, token_2)),
+      by = c("token_i_2" = "token_i_2")
+    )$token_2
+  }
+
+  # non matches
+  if ( dim1(text1_tokenized_prei) > 0 ){
+  tmp <-
+    subset(
+      cbind(text1_tokenized_prei, type = "ignored"),
+      !(text1_tokenized_prei$token_i_1 %in% alignment$token_i_1)
+    )
+  alignment <-
+    rtext:::rbind_fill(alignment, tmp)
+  }
+
+  if ( dim1(text2_tokenized_prei) > 0 ){
   tmp <-
     subset(
-      cbind(text2_tokenized, type = "ignored"),
-      !(text2_tokenized$token_i_2 %in% alignment$token_i_2)
+      cbind(text2_tokenized_prei, type = "ignored"),
+      !(text2_tokenized_prei$token_i_2 %in% alignment$token_i_2)
     )
   alignment <-
     rtext:::rbind_fill(alignment, tmp)
+  }
 
   # return
+  if ( !("type" %in% names(alignment)) ){
+    alignment <- cbind(alignment, type = character(0))
+  }
+  alignment <-
+    subset(
+      alignment,
+      select = c(
+        token_i_1, token_i_2, distance, type,
+        from_1, to_1, from_2, to_2,
+        token_1, token_2
+      )
+    )
   return(alignment)
 }
 
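For context, a minimal usage sketch follows. It is not part of the commit: the example strings, the choice of distance methods, and the expectation that the default tokenizer splits texts into lines are illustrative assumptions; only the diff_align() signature is taken from the diff above.

# hypothetical example: align two short texts with the renamed diff_align()
text_a <- "first line\nsecond line\nthird line"
text_b <- "first line\nsecond lines\nfourth line"

# default settings: line tokenization, Levenshtein ("lv") distance
alignment <- diff_align(text_a, text_b, distance = "lv")

# the new explicit arguments (useBytes, weight, maxDist, q, p, nthread) are
# forwarded to stringdist::amatch() / stringdist::stringdist(), e.g. for
# Jaro-Winkler with a prefix factor:
alignment_jw <- diff_align(text_a, text_b, distance = "jw", p = 0.1)

# per the subset() at the end of the function, the result should contain:
# token_i_1, token_i_2, distance, type, from_1, to_1, from_2, to_2, token_1, token_2
alignment[, c("token_1", "token_2", "distance", "type")]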