#' Token-Based Text Diff
#'
#' Tokenizes two texts, optionally cleans and filters their tokens, and
#' aligns them via approximate string matching, recording for each token
#' whether it is unchanged, changed, inserted, deleted, or ignored.
#'
#' @param text1 first text to compare
#' @param text2 second text to compare
#' @param tokenizer defaults to NULL, which triggers linewise tokenization;
#'     accepts a function that turns a text into a token data frame;
#'     a token data frame has at least three columns:
#'     from (first character of the token),
#'     to (last character of the token), and
#'     token (the token itself)
#' @param ignore defaults to NULL, which means that nothing is ignored;
#'     accepts a function that takes a token data frame (see above) and
#'     returns a possibly subsetted data frame of the same form
#' @param clean defaults to NULL, which means that nothing is cleaned;
#'     accepts a function that takes a vector of tokens and returns a
#'     vector of the same length with the tokens potentially cleaned up
#' @param distance defaults to Levenshtein ("lv"); see \link[stringdist]{amatch},
#'     \link[stringdist]{stringdist-metrics}, \link[stringdist]{stringdist}
#' @param ... further arguments passed through to the distance function,
#'     e.g. maxDist for \link[stringdist]{amatch}
#'
#' @return data frame with one row per token pair, aligned according to
#'     distance, including the token positions, the tokens themselves,
#'     the distance, and the type of change
#'
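#' @examples
#' # minimal usage sketch (not run; assumes the stringb package for the
#' # default linewise tokenizer): the middle line differs by one character,
#' # so it should align as a "change" with distance 1, while the other
#' # lines come back as "no-change"; maxDist is passed through to amatch()
#' \dontrun{
#' diffr(
#'   text1 = "one\ntwo\nthree",
#'   text2 = "one\ntoo\nthree",
#'   maxDist = 1
#' )
#' }
#'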
#' @export
diffr <- function(
  text1 = NULL,
  text2 = NULL,
  tokenizer = NULL,
  ignore = NULL,
  clean = NULL,
  distance = c("lv", "osa", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex"),
  ...
){
  # checking input
  if( is.function(distance) ){
    stop("using non-standard distance functions is not implemented yet - sorry")
  }
  stopifnot(
    !is.null(text1),
    is.character(text1),
    !is.null(text2),
    is.character(text2)
  )

  # assigning default options
  if( is.null(tokenizer) ){ tokenizer <- stringb::text_tokenize_lines }
  if( is.null(clean) ){ clean <- function(x){x} }
  if( is.null(ignore) ){ ignore <- function(x){x} }
  distance <- match.arg(distance)

  # tokenize: keep the from/to/token columns and add a running token index
  message(" - tokenizing text")
  text1_tokenized <- tokenizer(text1)[1:3]
  text1_tokenized$token_i <- seq_along(text1_tokenized$token)

  text2_tokenized <- tokenizer(text2)[1:3]
  text2_tokenized$token_i <- seq_along(text2_tokenized$token)

  # clean
  message(" - cleaning tokens")
  text1_tokenized$token <- clean(text1_tokenized$token)
  text2_tokenized$token <- clean(text2_tokenized$token)

  # ignore: keep pre-ignore copies so that ignored tokens can be
  # re-attached to the result later on
  message(" - ignoring tokens")
  text1_tokenized_prei <- text1_tokenized
  text2_tokenized_prei <- text2_tokenized
  text1_tokenized <- ignore(text1_tokenized)
  text2_tokenized <- ignore(text2_tokenized)

  # alignment and distances
  message(" - doing distance calculation and alignment")

  text1_tokenized <- setNames(text1_tokenized, c("from_1", "to_1", "token_1", "token_i_1"))
  text2_tokenized <- setNames(text2_tokenized, c("from_2", "to_2", "token_2", "token_i_2"))

  # distance
  a <-
    stringdist::amatch(
      text1_tokenized$token_1,
      text2_tokenized$token_2,
      method = distance,
      ...
    )
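  # `a` now holds, for each text1 token, the index of its closest text2
  # token - or NA when no candidate is within amatch()'s maxDist (default
  # 0.1, which for integer-valued metrics like "lv" effectively demands
  # exact matches unless a larger maxDist is passed through ...)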

  # alignment: one row per text1 token, joined with its matched text2
  # token (columns filled with NA where no match was found)
  alignment <-
    data.frame(
      text1_tokenized,
      text2_tokenized[a, ]
    )

  alignment$distance <-
    stringdist::stringdist(
      alignment$token_1,
      alignment$token_2,
      method = distance
    )

  # type and distances
  alignment$type <- ""
  alignment$type[alignment$distance == 0] <- "no-change"
  alignment$type[alignment$distance > 0]  <- "change"

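  # unmatched tokens: a text2 token without a text1 counterpart is an
  # insertion, a text1 token without a text2 counterpart a deletion;
  # their distance is measured against the empty string (i.e. under
  # "lv" it is simply the number of characters in the token)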
  iffer <- is.na(alignment$token_1)
  alignment[iffer, "type"] <- "insertion"
  alignment[iffer, "distance"] <- stringdist::stringdist("", alignment[iffer, "token_2"], method = distance)

  iffer <- is.na(alignment$token_2)
  alignment[iffer, "type"] <- "deletion"
  alignment[iffer, "distance"] <- stringdist::stringdist("", alignment[iffer, "token_1"], method = distance)

  # non-matches: re-attach tokens dropped by ignore() - plus any text2
  # tokens left unmatched - marked as "ignored"; the pre-ignore copies
  # get the same column names as their aligned counterparts first
  text1_tokenized_prei <- setNames(text1_tokenized_prei, c("from_1", "to_1", "token_1", "token_i_1"))
  text2_tokenized_prei <- setNames(text2_tokenized_prei, c("from_2", "to_2", "token_2", "token_i_2"))

  tmp <-
    subset(
      cbind(text1_tokenized_prei, type = "ignored"),
      !(text1_tokenized_prei$token_i_1 %in% alignment$token_i_1)
    )
  alignment <-
    rtext:::rbind_fill(alignment, tmp)

  tmp <-
    subset(
      cbind(text2_tokenized_prei, type = "ignored"),
      !(text2_tokenized_prei$token_i_2 %in% alignment$token_i_2)
    )
  alignment <-
    rtext:::rbind_fill(alignment, tmp)

  # return
  return(alignment)
}
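
# a rough sketch (kept as a comment, not part of the package API) of what
# a custom tokenizer might look like - a hypothetical `word_tokenizer`
# that treats whitespace-separated words as tokens and returns the
# required from/to/token columns for a single text:
#
#   word_tokenizer <- function(text){
#     m <- gregexpr("\\S+", text)[[1]]
#     data.frame(
#       from  = as.integer(m),
#       to    = as.integer(m) + attr(m, "match.length") - 1L,
#       token = regmatches(text, gregexpr("\\S+", text))[[1]],
#       stringsAsFactors = FALSE
#     )
#   }
#
#   diffr("one two three", "one two tree",
#         tokenizer = word_tokenizer, clean = tolower, maxDist = 1)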