
Commit e7a26f0

committed (dev)
1 parent 49fba27 commit e7a26f0

File tree: 2 files changed (+195 additions, -92 deletions)

2 files changed

+195
-92
lines changed

dev.R

Lines changed: 25 additions & 92 deletions
@@ -6,6 +6,7 @@ is_minimum <- diffrprojects:::is_minimum
 dim1 <- diffrprojects:::dim1
 which_dist_min_absolute <- diffrprojects:::which_dist_min_absolute
 choose_options <- diffrprojects:::choose_options
+split_tt_by_length <- diffrprojects:::split_tt_by_length
 
 
 library(dplyr)
@@ -30,20 +31,20 @@ text2 <- rtext$new(text_file=text_files[14], encoding="latin1")$text_get()
 #text2 <- rtext$new(text_file=stringb:::test_file("rc_3.txt"))$text_get()
 
 tokenizer <- text_tokenize_words
-ignore = function(from, to, token, token_i){rep(FALSE, length(token))}
+ignore = function(...){FALSE}
 clean = function(token){token}
-distance = function(token1, token2){matrix(0, nrow = length(token1), ncol = length(token2))}
+distance = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex")
 
 #### ---------------------------------------------------------------------------
 
 
-moc <- function(
+diffr <- function(
   text1 = NULL,
   text2 = NULL,
   tokenizer = function(text){text_tokenize_lines(text)},
-  ignore = function(from, to, token, token_i){rep(FALSE, length(token))},
-  clean = function(token){token},
-  distance = function(token1, token2){matrix(0, nrow = length(token1), ncol = length(token2))}
+  ignore = NULL,
+  clean = NULL,
+  distance = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex")
 ){}
 
 # tokenize
@@ -55,97 +56,29 @@ moc <- function(
 text2_tokenized$token_i <- seq_along(text2_tokenized$token)
 
 # clean
-message(" - cleaning token")
-text1_tokenized$token <- clean(text1_tokenized$token)
-text2_tokenized$token <- clean(text2_tokenized$token)
-
-# ignore
-message(" - ignoring token")
-text1_tokenized <- text1_tokenized %>% dplyr::filter( !ignore(text1_tokenized) )
-text2_tokenized <- text2_tokenized %>% dplyr::filter( !ignore(text2_tokenized) )
-
-# alignment and distances
-
-#### trivial matches -- unique equal token matches
-message(" - trivial matching")
-res <-
-  moc_helper_trivial_matches( tt1 = text1_tokenized, tt2 = text2_tokenized )
-
-
-#### easy matches -- text1 non-unique equal token matches
-message(" - easy matching 1")
-res <-
-  rbind(
-    res,
-    moc_helper_easy_matches( tt1 = text1_tokenized, tt2 = text2_tokenized, res= res, type=1)
-  )
-
-
-#### easy matches -- text2 non-unique equal token matches
-message(" - easy matching 2")
-res <-
-  rbind(
-    res,
-    moc_helper_easy_matches( tt1 = text1_tokenized, tt2 = text2_tokenized, res= res, type=2)
-  )
-
-#### easy matches -- text2 non-unique equal token matches
-message(" - easy matching 3")
-
-# prepare tt1 and tt2 as lists of data.frames
-tt1 <-
-  text1_tokenized %>%
-  filter( !(token_i %in% res$token_i_1) ) %>%
-  mutate( token_length = nchar(token) ) %>%
-  split(.$token_length) %>%
-  lapply( dplyr::mutate, token_length = NULL ) %>%
-  lapply(as.data.table) %>%
-  lapply(setkey, token, token_i)
-
-
-tt2 <-
-  text2_tokenized %>%
-  filter( !(token_i %in% res$token_i_2) ) %>%
-  mutate( token_length = nchar(token) ) %>%
-  split(.$token_length) %>%
-  lapply( dplyr::mutate, token_length = NULL ) %>%
-  lapply(as.data.table) %>%
-  lapply(setkey, token, token_i)
-
-tt_names <- unique(c(names(tt1), names(tt2)))
-
-# do the matches
-for( i in rev(seq_along(tt_names)) ) {
-  cat(i, " ", append=TRUE)
-  res <-
-    moc_helper_easy_matches(
-      tt1 = tt1[[tt_names[i]]],
-      tt2 = tt2[[tt_names[i]]],
-      res=res,
-      type=3
-    )
+if( !is.null(clean) ){
+  message(" - cleaning token")
+  text1_tokenized$token <- clean(text1_tokenized$token)
+  text2_tokenized$token <- clean(text2_tokenized$token)
 }
-cat("\n")
-
-# finishing matching of no-change type
-res$type <- "no-change"
-res$diff <- 0
 
-#### using dist function to match remaining
-tt1 <-
-  text1_tokenized %>%
-  filter( !(token_i %in% res$token_i_1) )
 
-tt2 <-
-  text2_tokenized %>%
-  filter( !(token_i %in% res$token_i_2) )
-
-
-# long strings first
-a <- adist(rep(tt1$token), rep(tt2$token))
-pryr::object_size(a)
+# ignore
+if( !is.null(ignore) ){
+  message(" - ignoring token")
+  text1_tokenized <- ignore(text1_tokenized)
+  text2_tokenized <- ignore(text2_tokenized)
+}
 
 
+# alignment and distances
+if( is.character(distance) ){
+  message(" - doing distance calculation and alignment")
+  a <- stringdist::amatch(text1_tokenized$token, text2_tokenized$token, method=distance)
+  alignment <- data.frame(text1_tokenized, text2_tokenized[a, ])
+}else{
+  stop("using non standard distance functions is not implemented yet - sorry")
+}
 
 
 

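The substantive change in dev.R is that the hand-rolled matching passes (moc_helper_trivial_matches / moc_helper_easy_matches plus an adist() distance matrix) are replaced by a single nearest-match lookup via stringdist::amatch. A minimal sketch of how that call behaves, assuming the stringdist package is installed; the token vectors below are made up purely for illustration:

library(stringdist)

tokens1 <- c("alpha", "beta", "gamma")   # tokens from text1
tokens2 <- c("gama", "alpha", "betta")   # tokens from text2

# For each token in tokens1, amatch() returns the index of the closest token
# in tokens2 under the chosen distance, or NA if nothing is within maxDist.
idx <- stringdist::amatch(tokens1, tokens2, method = "osa", maxDist = 2)

# Pair the two token sets the same way the new diffr() draft builds `alignment`.
alignment <- data.frame(token1 = tokens1, token2 = tokens2[idx])
alignment

Unlike the removed adist() pass, amatch() returns only an index vector rather than the full length(tokens1) x length(tokens2) distance matrix, which is the object the old pryr::object_size(a) call was checking.
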
dev_save.R

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
+# this is some trying out coming up with a general framework for
+# non stanard distance functions
+#### ---------------------------------------------------------------------------
+
+library(diffrprojects)
+is_unique <- diffrprojects:::is_unique
+is_minimum <- diffrprojects:::is_minimum
+dim1 <- diffrprojects:::dim1
+which_dist_min_absolute <- diffrprojects:::which_dist_min_absolute
+choose_options <- diffrprojects:::choose_options
+
+
+library(dplyr)
+library(data.table)
+library(dtplyr)
+library(Rcpp)
+
+
+
+
+
+#### ---------------------------------------------------------------------------
+
+text_path <- "~/Dropbox/IDEP_Database/rawdata/AUT/txts"
+
+text_files <- list.files(text_path, pattern = "txt", full.names = TRUE)
+
+text1 <- rtext$new(text_file=text_files[13], encoding="latin1")$text_get()
+text2 <- rtext$new(text_file=text_files[14], encoding="latin1")$text_get()
+
+#text1 <- rtext$new(text_file=stringb:::test_file("rc_2.txt"))$text_get()
+#text2 <- rtext$new(text_file=stringb:::test_file("rc_3.txt"))$text_get()
+
+tokenizer <- text_tokenize_words
+ignore = function(from, to, token, token_i){rep(FALSE, length(token))}
+clean = function(token){token}
+distance = function(token1, token2){matrix(0, nrow = length(token1), ncol = length(token2))}
+
+#### ---------------------------------------------------------------------------
+
+
+moc <- function(
+  text1 = NULL,
+  text2 = NULL,
+  tokenizer = function(text){text_tokenize_lines(text)},
+  ignore = function(from, to, token, token_i){rep(FALSE, length(token))},
+  clean = function(token){token},
+  distance = function(token1, token2){matrix(0, nrow = length(token1), ncol = length(token2))}
+){}
+
+# tokenize
+message(" - tokenizing text")
+text1_tokenized <- tokenizer(text1)[1:3]
+text1_tokenized$token_i <- seq_along(text1_tokenized$token)
+
+text2_tokenized <- tokenizer(text2)[1:3]
+text2_tokenized$token_i <- seq_along(text2_tokenized$token)
+
+# clean
+message(" - cleaning token")
+text1_tokenized$token <- clean(text1_tokenized$token)
+text2_tokenized$token <- clean(text2_tokenized$token)
+
+# ignore
+message(" - ignoring token")
+text1_tokenized <- text1_tokenized %>% dplyr::filter( !ignore(text1_tokenized) )
+text2_tokenized <- text2_tokenized %>% dplyr::filter( !ignore(text2_tokenized) )
+
+# alignment and distances
+
+#### trivial matches -- unique equal token matches
+message(" - trivial matching")
+res <-
+  moc_helper_trivial_matches( tt1 = text1_tokenized, tt2 = text2_tokenized )
+
+
+#### easy matches -- text1 non-unique equal token matches
+message(" - easy matching 1")
+res <-
+  rbind(
+    res,
+    moc_helper_easy_matches( tt1 = text1_tokenized, tt2 = text2_tokenized, res= res, type=1)
+  )
+
+
+#### easy matches -- text2 non-unique equal token matches
+message(" - easy matching 2")
+res <-
+  rbind(
+    res,
+    moc_helper_easy_matches( tt1 = text1_tokenized, tt2 = text2_tokenized, res= res, type=2)
+  )
+
+#### easy matches -- text2 non-unique equal token matches
+message(" - easy matching 3")
+
+# prepare tt1 and tt2 as lists of data.frames
+tt1 <-
+  text1_tokenized %>%
+  filter( !(token_i %in% res$token_i_1) ) %>%
+  mutate( token_length = nchar(token) ) %>%
+  split(.$token_length) %>%
+  lapply( dplyr::mutate, token_length = NULL ) %>%
+  lapply(as.data.table) %>%
+  lapply(setkey, token, token_i)
+
+
+tt2 <-
+  text2_tokenized %>%
+  filter( !(token_i %in% res$token_i_2) ) %>%
+  mutate( token_length = nchar(token) ) %>%
+  split(.$token_length) %>%
+  lapply( dplyr::mutate, token_length = NULL ) %>%
+  lapply(as.data.table) %>%
+  lapply(setkey, token, token_i)
+
+tt_names <- unique(c(names(tt1), names(tt2)))
+
+# do the matches
+for( i in rev(seq_along(tt_names)) ) {
+  cat(i, " ", append=TRUE)
+  res <-
+    moc_helper_easy_matches(
+      tt1 = tt1[[tt_names[i]]],
+      tt2 = tt2[[tt_names[i]]],
+      res=res,
+      type=3
+    )
+}
+cat("\n")
+
+# finishing matching of no-change type
+res$type <- "no-change"
+res$diff <- 0
+
+#### using dist function to match remaining
+tt1 <-
+  text1_tokenized %>%
+  filter( !(token_i %in% res$token_i_1) )
+
+tt2 <-
+  text2_tokenized %>%
+  filter( !(token_i %in% res$token_i_2) )
+
+
+# long strings first
+a <- adist(rep(tt1$token), rep(tt2$token))
+pryr::object_size(a)
+
+
+
+
+
+
+#### using dist function to match remaining
+tt1 <-
+  text1_tokenized %>%
+  filter( !(token_i %in% res$token_i_1) )
+
+tt2 <-
+  text2_tokenized %>%
+  filter( !(token_i %in% res$token_i_2) )
+
+if( is.character(distance) ){
+  a <- stringdist::amatch(text1_tokenized$token, text2_tokenized$token, method=distance, ...)
+}else{
+  stop("Not Implemented!")
+  distance(tt1, tt2)
+}
+

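dev_save.R keeps the old draft around, including the branch for function-valued distance arguments that is still stubbed out with stop("Not Implemented!"). As a purely hypothetical sketch (none of this is in the commit, and align_by_matrix is an invented name): a distance function of the old form distance(token1, token2), which returns a length(token1) x length(token2) matrix, could be turned into an alignment by taking the row-wise minimum.

# Hypothetical helper, not part of the commit: align two token vectors with a
# user-supplied matrix-valued distance function by picking, for every token in
# token1, the closest token in token2.
align_by_matrix <- function(token1, token2, distance){
  d <- distance(token1, token2)
  stopifnot(nrow(d) == length(token1), ncol(d) == length(token2))
  best <- apply(d, 1, which.min)                # column index of the row-wise minimum
  data.frame(
    token1 = token1,
    token2 = token2[best],
    dist   = d[cbind(seq_along(token1), best)]  # the minimal distances themselves
  )
}

# Example with base R's adist() as the matrix-valued distance function:
align_by_matrix(c("alpha", "beta"), c("betta", "alpa"), adist)
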