@@ -6,6 +6,7 @@ is_minimum <- diffrprojects:::is_minimum
dim1 <- diffrprojects:::dim1
which_dist_min_absolute <- diffrprojects:::which_dist_min_absolute
choose_options <- diffrprojects:::choose_options
+split_tt_by_length <- diffrprojects:::split_tt_by_length


library(dplyr)
@@ -30,20 +31,20 @@ text2 <- rtext$new(text_file=text_files[14], encoding="latin1")$text_get()
# text2 <- rtext$new(text_file=stringb:::test_file("rc_3.txt"))$text_get()

tokenizer <- text_tokenize_words
-ignore = function(from, to, token, token_i){rep(FALSE, length(token))}
+ignore = function(...){FALSE}
clean = function(token){token}
-distance = function(token1, token2){matrix(0, nrow = length(token1), ncol = length(token2))}
+distance = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex")

#### ---------------------------------------------------------------------------


-moc <- function(
+diffr <- function(
  text1 = NULL,
  text2 = NULL,
  tokenizer = function(text){text_tokenize_lines(text)},
-  ignore = function(from, to, token, token_i){rep(FALSE, length(token))},
-  clean = function(token){token},
-  distance = function(token1, token2){matrix(0, nrow = length(token1), ncol = length(token2))}
+  ignore = NULL,
+  clean = NULL,
+  distance = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex")
){}

  # tokenize
@@ -55,97 +56,29 @@ moc <- function(
  text2_tokenized$token_i <- seq_along(text2_tokenized$token)

  # clean
-  message(" - cleaning token")
-  text1_tokenized$token <- clean(text1_tokenized$token)
-  text2_tokenized$token <- clean(text2_tokenized$token)
-
-  # ignore
-  message(" - ignoring token")
-  text1_tokenized <- text1_tokenized %>% dplyr::filter( !ignore(text1_tokenized) )
-  text2_tokenized <- text2_tokenized %>% dplyr::filter( !ignore(text2_tokenized) )
-
-  # alignment and distances
-
-  #### trivial matches -- unique equal token matches
-  message(" - trivial matching")
-  res <-
-    moc_helper_trivial_matches(tt1 = text1_tokenized, tt2 = text2_tokenized)
-
-
-  #### easy matches -- text1 non-unique equal token matches
-  message(" - easy matching 1")
-  res <-
-    rbind(
-      res,
-      moc_helper_easy_matches(tt1 = text1_tokenized, tt2 = text2_tokenized, res = res, type = 1)
-    )
-
-
-  #### easy matches -- text2 non-unique equal token matches
-  message(" - easy matching 2")
-  res <-
-    rbind(
-      res,
-      moc_helper_easy_matches(tt1 = text1_tokenized, tt2 = text2_tokenized, res = res, type = 2)
-    )
-
-  #### easy matches -- text2 non-unique equal token matches
-  message(" - easy matching 3")
-
-  # prepare tt1 and tt2 as lists of data.frames
-  tt1 <-
-    text1_tokenized %>%
-    filter( !(token_i %in% res$token_i_1) ) %>%
-    mutate(token_length = nchar(token)) %>%
-    split(.$token_length) %>%
-    lapply(dplyr::mutate, token_length = NULL) %>%
-    lapply(as.data.table) %>%
-    lapply(setkey, token, token_i)
-
-
-  tt2 <-
-    text2_tokenized %>%
-    filter( !(token_i %in% res$token_i_2) ) %>%
-    mutate(token_length = nchar(token)) %>%
-    split(.$token_length) %>%
-    lapply(dplyr::mutate, token_length = NULL) %>%
-    lapply(as.data.table) %>%
-    lapply(setkey, token, token_i)
-
-  tt_names <- unique(c(names(tt1), names(tt2)))
-
-  # do the matches
-  for ( i in rev(seq_along(tt_names)) ) {
-    cat(i, " ", append = TRUE)
-    res <-
-      moc_helper_easy_matches(
-        tt1  = tt1[[tt_names[i]]],
-        tt2  = tt2[[tt_names[i]]],
-        res  = res,
-        type = 3
-      )
+  if( !is.null(clean) ){
+    message(" - cleaning token")
+    text1_tokenized$token <- clean(text1_tokenized$token)
+    text2_tokenized$token <- clean(text2_tokenized$token)
  }
-  cat("\n")
-
-  # finishing matching of no-change type
-  res$type <- "no-change"
-  res$diff <- 0

-  #### using dist function to match remaining
-  tt1 <-
-    text1_tokenized %>%
-    filter( !(token_i %in% res$token_i_1) )

-  tt2 <-
-    text2_tokenized %>%
-    filter( !(token_i %in% res$token_i_2) )
-
-
-  # long strings first
-  a <- adist(rep(tt1$token), rep(tt2$token))
-  pryr::object_size(a)
+  # ignore
+  if( !is.null(ignore) ){
+    message(" - ignoring token")
+    text1_tokenized <- ignore(text1_tokenized)
+    text2_tokenized <- ignore(text2_tokenized)
+  }


+  # alignment and distances
+  if( is.character(distance) ){
+    message(" - doing distance calculation and alignment")
+    a <- stringdist::amatch(text1_tokenized$token, text2_tokenized$token, method = distance)
+    alignment <- data.frame(text1_tokenized, text2_tokenized[a, ])
+  }else{
+    stop("using non standard distance functions is not implemented yet - sorry")
+  }

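With the new defaults, clean and ignore become optional hooks: clean is still applied to the token vector, while ignore is now applied to the whole tokenized data frame and its return value replaces it, rather than yielding a logical vector to filter on as in the removed code. A minimal sketch of callbacks fitting that contract (hypothetical helper names, not part of this commit):

# hypothetical callbacks matching the contract used in the diff above
clean_tokens <- function(token){
  # lower-case and trim every token
  trimws(tolower(token))
}

ignore_tokens <- function(tt){
  # drop empty and purely numeric tokens from the tokenized data frame
  tt[!(tt$token == "" | grepl("^[0-9]+$", tt$token)), , drop = FALSE]
}

# usage sketch: diffr(text1, text2, clean = clean_tokens, ignore = ignore_tokens)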
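The rewritten alignment step relies on stringdist::amatch(), which returns, for every token of the first vector, the index of its closest approximate match in the second vector. Below is a minimal standalone sketch of that idea with made-up token vectors (not part of this commit); note that amatch() uses a single method per call, so passing the full default vector of method names effectively falls back to its first entry, "osa".

# minimal sketch of amatch-based token alignment (hypothetical example data)
library(stringdist)

tokens1 <- c("diffing", "two", "texts", "token", "by", "token")
tokens2 <- c("diff", "of", "two", "text", "tokens")

# index of the closest candidate in tokens2 for each element of tokens1,
# NA where nothing is within maxDist
a <- amatch(tokens1, tokens2, method = "osa", maxDist = 3)

# side-by-side table, analogous to data.frame(text1_tokenized, text2_tokenized[a, ])
alignment <- data.frame(token_1 = tokens1, token_2 = tokens2[a])
alignment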