Commit e321569: "diffr done"
1 parent e7a26f0

13 files changed: +341 / -73 lines

DESCRIPTION

Lines changed: 1 addition & 0 deletions

@@ -39,6 +39,7 @@ Imports:
     data.table (>= 1.9.6),
     dtplyr (>= 0.0.1),
     Rcpp (>= 0.12.6),
+    stringdist (>= 0.9.4.1),
     stats,
     graphics
 Suggests:
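
For context: this new Imports entry is what the token alignment in R/diffr.R below runs on. A minimal illustration of the two stringdist calls the package uses, stringdist::amatch() for approximate matching against a table of candidates and stringdist::stringdist() for pairwise distances; the example strings are made up:

    library(stringdist)

    # index of the closest candidate within maxDist, NA if none is close enough
    amatch("stringdost", c("stringdist", "stringb"), method = "lv", maxDist = 2)
    #> [1] 1

    # pairwise Levenshtein distance
    stringdist("kitten", "sitting", method = "lv")
    #> [1] 3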

NAMESPACE

Lines changed: 1 addition & 0 deletions

@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(diffr)
 export(diffrproject)
 export(dp_text_base_data)
 export(moc_helper_easy_matches)

R/diffr.R

Lines changed: 153 additions & 0 deletions

@@ -0,0 +1,153 @@
+#' FUNCTION_TITLE
+#'
+#' FUNCTION_DESCRIPTION
+#'
+#' @param text1 first text
+#' @param text2 second text
+#' @param tokenizer defaults to NULL, which triggers linewise tokenization;
+#'   accepts a function that turns a text into a token data frame;
+#'   a token data frame has at least three columns:
+#'   from (first character of the token),
+#'   to (last character of the token),
+#'   token (the token itself)
+#' @param ignore defaults to NULL, which means that nothing is ignored;
+#'   accepts a function that takes a token data frame (see above) and returns a
+#'   possibly subsetted data frame of the same form
+#' @param clean defaults to NULL, which means that nothing is cleaned;
+#'   accepts a function that takes a vector of tokens and returns a
+#'   potentially cleaned-up vector of the same length
+#' @param distance defaults to Levenshtein ("lv"); see \link[stringdist]{amatch},
+#'   \link[stringdist]{stringdist-metrics}, \link[stringdist]{stringdist}
+#' @param ... further arguments passed through to the distance function
+#'
+#' @return data frame with tokens aligned according to distance
+#'
+#' @export
+diffr <- function(
+  text1 = NULL,
+  text2 = NULL,
+  tokenizer = NULL,
+  ignore = NULL,
+  clean = NULL,
+  distance = c("lv", "osa", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex"),
+  ...
+){
+  # checking input
+  if( is.function(distance) ){ stop("using non-standard distance functions is not implemented yet - sorry") }
+  stopifnot(
+    !is.null(text1),
+    is.character(text1),
+    !is.null(text2),
+    is.character(text2)
+  )
+
+  # assigning default options
+  if( is.null(tokenizer) ){ tokenizer <- stringb::text_tokenize_lines }
+  if( is.null(clean) ){ clean <- function(x){x} }
+  if( is.null(ignore) ){ ignore <- function(x){x} }
+  distance <- distance[1]
+
+  # tokenize
+  message(" - tokenizing text")
+  text1_tokenized <- tokenizer(text1)[1:3]
+  text1_tokenized$token_i <- seq_along(text1_tokenized$token)
+
+  text2_tokenized <- tokenizer(text2)[1:3]
+  text2_tokenized$token_i <- seq_along(text2_tokenized$token)
+
+  # clean
+  message(" - cleaning tokens")
+  text1_tokenized$token <- clean(text1_tokenized$token)
+  text2_tokenized$token <- clean(text2_tokenized$token)
+
+  # ignore
+  message(" - ignoring tokens")
+  text1_tokenized_prei <- text1_tokenized
+  text2_tokenized_prei <- text2_tokenized
+  text1_tokenized <- ignore(text1_tokenized)
+  text2_tokenized <- ignore(text2_tokenized)
+
+  # alignment and distances
+  message(" - doing distance calculation and alignment")
+
+  text1_tokenized <- setNames(text1_tokenized, c("from_1", "to_1", "token_1", "token_i_1"))
+  text2_tokenized <- setNames(text2_tokenized, c("from_2", "to_2", "token_2", "token_i_2"))
+
+  # distance
+  a <-
+    stringdist::amatch(
+      text1_tokenized$token_1,
+      text2_tokenized$token_2,
+      method = distance,
+      ...
+    )
+
+  # alignment
+  alignment <-
+    data.frame(
+      text1_tokenized,
+      text2_tokenized[a, ]
+    )
+
+  alignment$distance <-
+    stringdist::stringdist(
+      alignment$token_1,
+      alignment$token_2,
+      method = distance
+    )
+
+  # type and distances
+  alignment$type <- ""
+  alignment$type[alignment$distance == 0] <- "no-change"
+  alignment$type[alignment$distance >  0] <- "change"
+
+  iffer <- is.na(alignment$token_1)
+  alignment[iffer, "type"] <- "insertion"
+  alignment[iffer, "distance"] <- stringdist::stringdist("", alignment[iffer, "token_2"])
+
+  iffer <- is.na(alignment$token_2)
+  alignment[iffer, "type"] <- "deletion"
+  alignment[iffer, "distance"] <- stringdist::stringdist("", alignment[iffer, "token_1"])
+
+  # non-matches
+  tmp <-
+    subset(
+      cbind(text1_tokenized, type = "ignored"),
+      !(text1_tokenized$token_i_1 %in% alignment$token_i_1)
+    )
+  alignment <-
+    rtext:::rbind_fill(alignment, tmp)
+
+  tmp <-
+    subset(
+      cbind(text2_tokenized, type = "ignored"),
+      !(text2_tokenized$token_i_2 %in% alignment$token_i_2)
+    )
+  alignment <-
+    rtext:::rbind_fill(alignment, tmp)
+
+  # return
+  return(alignment)
+}
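
A usage sketch of the contracts documented above. The input strings are made up; stringb::text_tokenize_words is assumed to honour the tokenizer contract (a data frame with from, to, token columns), which matches how dev.R below uses it:

    library(diffrprojects)

    a <- "the quick brown fox\njumps over the lazy dog"
    b <- "the quick red fox\njumps over a lazy cat"

    # defaults: linewise tokenization, Levenshtein ("lv")
    alignment <- diffr(a, b)
    table(alignment$type)

    # custom contracts: word tokens, lower-cased, one-character tokens dropped
    alignment <-
      diffr(
        a, b,
        tokenizer = stringb::text_tokenize_words,
        clean     = tolower,
        ignore    = function(tt){ tt[nchar(tt$token) > 1, ] },
        distance  = "osa"
      )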

R/moc.R

Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+#' stub
+#' @keywords internal
+moc <- function(
+  text1 = NULL,
+  text2 = NULL,
+  tokenizer = function(text){text_tokenize_lines(text)},
+  ignore = function(...){FALSE},
+  clean = function(token){token},
+  distance = function(token1, token2){matrix(0, nrow = length(token1), ncol = length(token2))},
+  alignment = function(m){}
+){
+  # alignment and distances
+
+  #### trivial matches -- unique equal token matches
+  message(" - trivial matching")
+  res <-
+    moc_helper_trivial_matches( tt1 = text1_tokenized, tt2 = text2_tokenized )
+
+  #### easy matches -- text1 non-unique equal token matches
+  message(" - easy matching 1")
+  res <-
+    rbind(
+      res,
+      moc_helper_easy_matches( tt1 = text1_tokenized, tt2 = text2_tokenized, res = res, type = 1 )
+    )
+
+  #### easy matches -- text2 non-unique equal token matches
+  message(" - easy matching 2")
+  res <-
+    rbind(
+      res,
+      moc_helper_easy_matches( tt1 = text1_tokenized, tt2 = text2_tokenized, res = res, type = 2 )
+    )
+
+  #### easy matches -- remaining equal token matches, bucketed by token length
+  message(" - easy matching 3")
+
+  # prepare tt1 and tt2, then split them into lists of data.frames by token length
+  tt1 <-
+    text1_tokenized %>%
+    filter( !(token_i %in% res$token_i_1) )
+
+  tt2 <-
+    text2_tokenized %>%
+    filter( !(token_i %in% res$token_i_2) )
+
+  tt1_split <- split_tt_by_length(tt1)
+  tt2_split <- split_tt_by_length(tt2)
+
+  tt_names <- unique(c(names(tt1_split), names(tt2_split)))
+
+  # do the matches
+  for( i in rev(seq_along(tt_names)) ){
+    cat(i, " ")
+    res <-
+      moc_helper_easy_matches(
+        tt1 = tt1_split[[tt_names[i]]],
+        tt2 = tt2_split[[tt_names[i]]],
+        res = res,
+        type = 3
+      )
+  }
+  cat("\n")
+
+  # finish matching of no-change type
+  res$type <- "no-change"
+  res$diff <- 0
+}
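
moc() is a stub and moc_helper_trivial_matches() is not part of this commit's diff, but the first pass it names, pairing tokens that are equal and occur exactly once in each text, can be sketched as follows. This is a hypothetical re-implementation for illustration, not the package's code:

    # sketch: pair tokens that are unique within each text and identical across texts
    trivial_matches_sketch <- function(tt1, tt2){
      u1 <- tt1[tt1$token %in% names(which(table(tt1$token) == 1)), ]
      u2 <- tt2[tt2$token %in% names(which(table(tt2$token) == 1)), ]
      # merge on the token itself; suffixes yield token_i_1 / token_i_2 style names
      merge(u1, u2, by = "token", suffixes = c("_1", "_2"))
    }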

R/moc_helper.R

Lines changed: 27 additions & 2 deletions

@@ -1,6 +1,31 @@
+#' splitting a tokenized text
+#' @param tt tokenized text
+#' @keywords internal
+split_tt_by_length <- function(tt){
+  tt %>%
+    dplyr::mutate(
+      token_length = nchar(token)
+    ) %>%
+    split(
+      .$token_length
+    ) %>%
+    lapply(
+      dplyr::mutate,
+      token_length = NULL
+    ) %>%
+    lapply(
+      as.data.table
+    ) %>%
+    lapply(
+      setkey,
+      token, token_i
+    )
+}
+
 #' trivial matches
 #'
-#' merthod of comparison helper function
+#' method of comparison helper function
 #' @param tt1 tokenized text number 1
 #' @param tt2 tokenized text number 2
 #' @export

@@ -89,7 +114,7 @@ moc_helper_easy_matches <- function(tt1, tt2, res, type=c(1,2), fullreturn=TRUE)
 
   # return
   if( fullreturn ){
-    return(rbind(res,chosen))
+    return( rbind(res, data.table(chosen), fill=TRUE) )
   }else{
     return(chosen)
   }
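
What split_tt_by_length() produces is a list of keyed data.tables, one per token length, which is what the length-bucketed matching in R/moc.R iterates over. A toy run (the function is internal, hence the ::: access; the from/to values are made up):

    library(dplyr)
    library(data.table)

    tt <- data.frame(
      from    = c(1, 3, 6, 9),
      to      = c(1, 4, 7, 12),
      token   = c("a", "bb", "cc", "dddd"),
      token_i = 1:4,
      stringsAsFactors = FALSE
    )

    diffrprojects:::split_tt_by_length(tt)
    # list with elements $`1` (token "a"), $`2` ("bb", "cc"), $`4` ("dddd"),
    # each a data.table keyed by token, token_i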

dev.R

Lines changed: 3 additions & 70 deletions

@@ -1,89 +1,22 @@
 #### ---------------------------------------------------------------------------
 
 library(diffrprojects)
-is_unique <- diffrprojects:::is_unique
-is_minimum <- diffrprojects:::is_minimum
-dim1 <- diffrprojects:::dim1
-which_dist_min_absolute <- diffrprojects:::which_dist_min_absolute
-choose_options <- diffrprojects:::choose_options
-split_tt_by_length <- diffrprojects:::split_tt_by_length
-
-
-library(dplyr)
-library(data.table)
-library(dtplyr)
-library(Rcpp)
-
-
-
-
 
 #### ---------------------------------------------------------------------------
 
 text_path <- "~/Dropbox/IDEP_Database/rawdata/AUT/txts"
 
 text_files <- list.files(text_path, pattern = "txt", full.names = TRUE)
 
-text1 <- rtext$new(text_file=text_files[13], encoding="latin1")$text_get()
-text2 <- rtext$new(text_file=text_files[14], encoding="latin1")$text_get()
+text1 <- rtext$new(text_file=text_files[13], encoding="latin1")$text_get(2000)
+text2 <- rtext$new(text_file=text_files[14], encoding="latin1")$text_get(2000)
 
 #text1 <- rtext$new(text_file=stringb:::test_file("rc_2.txt"))$text_get()
 #text2 <- rtext$new(text_file=stringb:::test_file("rc_3.txt"))$text_get()
 
-tokenizer <- text_tokenize_words
-ignore = function(...){FALSE}
-clean = function(token){token}
-distance = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex")
-
 #### ---------------------------------------------------------------------------
 
-
-diffr <- function(
-  text1 = NULL,
-  text2 = NULL,
-  tokenizer = function(text){text_tokenize_lines(text)},
-  ignore = NULL,
-  clean = NULL,
-  distance = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex")
-){}
-
-# tokenize
-message(" - tokenizing text")
-text1_tokenized <- tokenizer(text1)[1:3]
-text1_tokenized$token_i <- seq_along(text1_tokenized$token)
-
-text2_tokenized <- tokenizer(text2)[1:3]
-text2_tokenized$token_i <- seq_along(text2_tokenized$token)
-
-# clean
-if( !is.null(clean) ){
-  message(" - cleaning token")
-  text1_tokenized$token <- clean(text1_tokenized$token)
-  text2_tokenized$token <- clean(text2_tokenized$token)
-}
-
-
-# ignore
-if( !is.null(ignore) ){
-  message(" - ignoring token")
-  text1_tokenized <- ignore(text1_tokenized)
-  text2_tokenized <- ignore(text2_tokenized)
-}
-
-
-# alignment and distances
-if( is.character(distance) ){
-  message(" - doing distance calculation and alignment")
-  a <- stringdist::amatch(text1_tokenized$token, text2_tokenized$token, method=distance)
-  alignment <- data.frame(text1_tokenized, text2_tokenized[a, ])
-}else{
-  stop("using non standard distance functions is not implemented yet - sorry")
-}
-
-
-
-
-
+diffr(text1, text2)
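
Given the alignment data frame that diffr() returns (see R/diffr.R above), the call at the end of the script can be followed by a quick sanity check; a sketch, assuming the run succeeds on the local files:

    alignment <- diffr(text1, text2)

    # distribution of edit types across aligned tokens
    table(alignment$type)

    # the most heavily edited token pairs first
    head(alignment[order(-alignment$distance), c("token_1", "token_2", "distance", "type")])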

dev.cpp

Whitespace-only changes.

dev_save.R

Lines changed: 10 additions & 0 deletions

@@ -149,6 +149,16 @@ moc <- function(
 
 
 
+# alignment via Hungarian solution to the assignment problem
+
+library(clue)
+
+m <- adist(tt1$token, tt2$token)
+solution_index_v <- as.numeric(solve_LSAP(m))
+solution_index_m <- as.matrix(cbind(seq_along(solution_index_v), solution_index_v))
+
+aligned <- cbind(tt1, tt2[solution_index_v, ], dist = m[solution_index_m])
+
 
 
 
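
clue::solve_LSAP() solves the linear sum assignment problem (the Hungarian method) over the Levenshtein matrix that base R's adist() returns, so every token of tt1 gets exactly one partner in tt2 at minimal total distance. A self-contained toy run with made-up tokens; note that solve_LSAP() requires nrow(m) <= ncol(m):

    library(clue)

    t1 <- c("alpha", "beta", "gamma")
    t2 <- c("alpah", "gamma", "betta", "delta")

    m   <- adist(t1, t2)              # 3 x 4 Levenshtein distance matrix
    sol <- as.integer(solve_LSAP(m))  # one column index per row, minimal total cost

    data.frame(
      token_1 = t1,
      token_2 = t2[sol],
      dist    = m[cbind(seq_along(sol), sol)]
    )
    #>   token_1 token_2 dist
    #> 1   alpha   alpah    2
    #> 2    beta   betta    1
    #> 3   gamma   gamma    0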
