-#' FUNCTION_TITLE
+#' aligning texts
 #'
-#' FUNCTION_DESCRIPTION
+#' Aligns two texts side by side in a data.frame that also reports the change
+#' type and distance for each pair of aligned tokens.
 #'
 #' @param text1 first text
 #' @param text2 second text
 #' @param distance defaults to Levenshtein ("lv"); see \link[stringdist]{amatch},
 #'   \link[stringdist]{stringdist-metrics}, \link[stringdist]{stringdist}
 #' @param ... further arguments passed through to distance function
+#' @inheritParams stringdist::stringdist
 #'
 #' @return dataframe with tokens aligned according to distance
 #'
 #' @export
-diffr <- function(
+diff_align <- function(
   text1 = NULL,
   text2 = NULL,
   tokenizer = NULL,
   ignore = NULL,
   clean = NULL,
   distance = c("lv", "osa", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex"),
+  useBytes = FALSE,
+  weight = c(d = 1, i = 1, s = 1, t = 1),
+  maxDist = Inf,
+  q = 1,
+  p = 0,
+  nthread = getOption("sd_num_thread"),
   ...
 ){
   # checking input
@@ -45,6 +53,8 @@ diffr <- function(
   if ( is.null(tokenizer) ){ tokenizer <- stringb::text_tokenize_lines }
   if ( is.null(clean) ){ clean <- function(x){x} }
   if ( is.null(ignore) ){ ignore <- function(x){x} }
+  if ( length(text1) > 1 ){ text1 <- text_collapse(text1) }
+  if ( length(text2) > 1 ){ text2 <- text_collapse(text2) }
   distance <- distance[1]
 
   # tokenize
@@ -67,19 +77,27 @@ diffr <- function(
   text1_tokenized <- ignore(text1_tokenized)
   text2_tokenized <- ignore(text2_tokenized)
 
+  # column naming
+  text1_tokenized_prei <- stats::setNames(text1_tokenized_prei, c("from_1", "to_1", "token_1", "token_i_1"))
+  text2_tokenized_prei <- stats::setNames(text2_tokenized_prei, c("from_2", "to_2", "token_2", "token_i_2"))
+  text1_tokenized <- stats::setNames(text1_tokenized, c("from_1", "to_1", "token_1", "token_i_1"))
+  text2_tokenized <- stats::setNames(text2_tokenized, c("from_2", "to_2", "token_2", "token_i_2"))
+
   # alignment and distances
   message("- doing distance calculation and alignment")
 
-  text1_tokenized <- setNames(text1_tokenized, c("from_1", "to_1", "token_1", "token_i_1"))
-  text2_tokenized <- setNames(text2_tokenized, c("from_2", "to_2", "token_2", "token_2_1"))
-
   # distance
   a <-
     stringdist::amatch(
       text1_tokenized$token_1,
       text2_tokenized$token_2,
       method = distance,
-      ...
+      useBytes = useBytes,
+      weight = weight,
+      maxDist = maxDist,
+      q = q,
+      p = p,
+      nthread = nthread
     )
 
   # alignment
@@ -93,40 +111,113 @@ diffr <- function(
     stringdist::stringdist(
       alignment$token_1,
       alignment$token_2,
-      method = distance
+      method = distance,
+      useBytes = useBytes,
+      weight = weight,
+      q = q,
+      p = p,
+      nthread = nthread
     )
 
   # type and distances
-  alignment$type <- ""
-  alignment$type[alignment$distance == 0] <- "no-change"
-  alignment$type[alignment$distance > 0] <- "change"
+  if ( dim1(alignment) > 0 ){
+  alignment$type <- ""
+  alignment$type[alignment$distance == 0] <- "no-change"
+  alignment$type[alignment$distance > 0] <- "change"
 
-  iffer <- is.na(alignment$token_1)
-  alignment[iffer, "type"] <- "insertion"
-  alignment[iffer, "distance"] <- stringdist::stringdist("", alignment[iffer, "token_2"])
+  alignment <-
+    rtext:::rbind_fill(
+      alignment,
+      text1_tokenized[
+        !(text1_tokenized$token_i_1 %in% alignment$token_i_1),
+      ]
+    )
+
+  alignment <-
+    rtext:::rbind_fill(
+      alignment,
+      text2_tokenized[
+        !(text2_tokenized$token_i_2 %in% alignment$token_i_2),
+      ]
+    )
 
   iffer <- is.na(alignment$token_2)
   alignment[iffer, "type"] <- "deletion"
-  alignment[iffer, "distance"] <- stringdist::stringdist("", alignment[iffer, "token_1"])
+  alignment[iffer, "distance"] <-
+    stringdist::stringdist(
+      "",
+      alignment[iffer, "token_1"],
+      method = distance,
+      useBytes = useBytes,
+      weight = weight,
+      q = q,
+      p = p,
+      nthread = nthread
+    )
 
-  # non matches
-  tmp <-
-    subset(
-      cbind(text1_tokenized, type = "ignored"),
-      !(text1_tokenized$token_i_1 %in% alignment$token_i_1)
+  iffer <- is.na(alignment$token_1)
+  alignment[iffer, "type"] <- "insertion"
+  alignment[iffer, "distance"] <-
+    stringdist::stringdist(
+      "",
+      alignment[iffer, "token_2"],
+      method = distance,
+      useBytes = useBytes,
+      weight = weight,
+      q = q,
+      p = p,
+      nthread = nthread
     )
-  alignment <-
-    rtext:::rbind_fill(alignment, tmp)
 
+  alignment$token_1 <-
+    dplyr::left_join(
+      subset(alignment, TRUE, token_i_1),
+      subset(text1_tokenized_prei, TRUE, c(token_i_1, token_1)),
+      by = c("token_i_1" = "token_i_1")
+    )$token_1
+
+  alignment$token_2 <-
+    dplyr::left_join(
+      subset(alignment, TRUE, token_i_2),
+      subset(text2_tokenized_prei, TRUE, c(token_i_2, token_2)),
+      by = c("token_i_2" = "token_i_2")
+    )$token_2
+  }
+
+  # non matches
+  if ( dim1(text1_tokenized_prei) > 0 ){
+  tmp <-
+    subset(
+      cbind(text1_tokenized_prei, type = "ignored"),
+      !(text1_tokenized_prei$token_i_1 %in% alignment$token_i_1)
+    )
+  alignment <-
+    rtext:::rbind_fill(alignment, tmp)
+  }
+
+  if ( dim1(text2_tokenized_prei) > 0 ){
   tmp <-
     subset(
-      cbind(text2_tokenized, type = "ignored"),
-      !(text2_tokenized$token_i_2 %in% alignment$token_i_2)
+      cbind(text2_tokenized_prei, type = "ignored"),
+      !(text2_tokenized_prei$token_i_2 %in% alignment$token_i_2)
     )
   alignment <-
     rtext:::rbind_fill(alignment, tmp)
+  }
 
   # return
+  if ( !("type" %in% names(alignment)) ){
+    alignment <- cbind(alignment, type = character(0))
+  }
+  alignment <-
+    subset(
+      alignment,
+      select = c(
+        token_i_1, token_i_2, distance, type,
+        from_1, to_1, from_2, to_2,
+        token_1, token_2
+      )
+    )
   return(alignment)
 }
 
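For context, a minimal usage sketch follows. It is not part of the commit: the example strings, the choice of distance methods, and the expectation that the default tokenizer splits texts into lines are illustrative assumptions; only the diff_align() signature is taken from the diff above.

# hypothetical example: align two short texts with the renamed diff_align()
text_a <- "first line\nsecond line\nthird line"
text_b <- "first line\nsecond lines\nfourth line"

# default settings: line tokenization, Levenshtein ("lv") distance
alignment <- diff_align(text_a, text_b, distance = "lv")

# the new explicit arguments (useBytes, weight, maxDist, q, p, nthread) are
# forwarded to stringdist::amatch() / stringdist::stringdist(), e.g. for
# Jaro-Winkler with a prefix factor:
alignment_jw <- diff_align(text_a, text_b, distance = "jw", p = 0.1)

# per the subset() at the end of the function, the result should contain:
# token_i_1, token_i_2, distance, type, from_1, to_1, from_2, to_2, token_1, token_2
alignment[, c("token_1", "token_2", "distance", "type")]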