
Commit 2db9222: "passing all checks"

Parent: d002a04

File tree: 12 files changed, +106 -62 lines


DESCRIPTION

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ Imports:
     dtplyr (>= 0.0.1),
     Rcpp (>= 0.12.6),
     stringdist (>= 0.9.4.1),
+    magrittr,
     stats,
     graphics
 Suggests:
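
The new magrittr entry pairs with the export("%>%") already present in NAMESPACE (next file): re-exporting the pipe requires magrittr to be a declared dependency, otherwise R CMD check flags the import. The usual roxygen2 idiom for such a re-export looks like the sketch below; this is the common pattern, not necessarily this package's exact file.

    #' Pipe operator, re-exported from magrittr
    #' @name %>%
    #' @rdname pipe
    #' @keywords internal
    #' @export
    #' @importFrom magrittr %>%
    NULL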

NAMESPACE

Lines changed: 0 additions & 3 deletions
@@ -4,9 +4,6 @@ export("%>%")
 export(diff_align)
 export(diffrproject)
 export(dp_text_base_data)
-export(moc_helper_easy_matches)
-export(moc_helper_get_options_ordered_by_dist)
-export(moc_helper_trivial_matches)
 import(data.table)
 import(hellno)
 import(rtext)
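
The three moc_helper_* exports disappear because their roxygen tags in R/moc_helper.R change from @export to @keywords internal (see that file below); regenerating the documentation then rewrites NAMESPACE without the export() directives. Schematically, assuming the standard roxygen2 workflow (illustrative stub, not package code):

    # After switching the tag, roxygen2::roxygenise() drops the export()
    # line for this function from NAMESPACE:

    #' Internal helper
    #' @keywords internal
    example_helper <- function(x) x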

NEWS.md

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ version 0.1.2 // 2016-08-26 ...

 * FEATURE
     - tests tests tests
+    - passing all checks


 * DEVELOPMENT

R/diffr.R

Lines changed: 33 additions & 20 deletions
@@ -34,7 +34,7 @@ diff_align <- function(
   distance = c("lv", "osa", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex"),
   useBytes = FALSE,
   weight = c(d = 1, i = 1, s = 1, t = 1),
-  maxDist = Inf,
+  maxDist = 0,
   q = 1,
   p = 0,
   nthread = getOption("sd_num_thread"),
@@ -56,6 +56,7 @@ diff_align <- function(
   if( length(text1) > 1){ text1 <- text_collapse(text1) }
   if( length(text2) > 1){ text2 <- text_collapse(text2) }
   distance <- distance[1]
+  if(maxDist == 0){ maxDist <- 1e-150}

   # tokenize
   message(" - tokenizing text")
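
Together with the signature change above, maxDist = 0 now means "exact token matches only", and the remap to 1e-150 keeps that semantics while avoiding a literal zero bound. A runnable sketch of the effect, assuming the matcher behaves like stringdist::amatch (stringdist is in Imports; that diff_align delegates to amatch is an assumption here):

    library(stringdist)

    tokens <- c("alpha", "beta",  "gamma")
    lookup <- c("alpha", "betta", "gamma")

    # near-zero bound: only distance-0 (exact) tokens find a partner
    amatch(tokens, lookup, method = "lv", maxDist = 1e-150)
    #> [1]  1 NA  3

    # larger bound: "beta" also matches "betta" (one edit away)
    amatch(tokens, lookup, method = "lv", maxDist = 1)
    #> [1] 1 2 3

    # matchNA = FALSE (added a few hunks below) keeps NA tokens from
    # pairing with NA entries in the lookup table
    amatch(c("alpha", NA), c(lookup, NA), method = "lv", maxDist = 1,
           matchNA = FALSE)
    #> [1]  1 NA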
@@ -67,6 +68,8 @@ diff_align <- function(

   # clean
   message(" - cleaning token")
+  text1_tokenized_prec <- text1_tokenized
+  text2_tokenized_prec <- text2_tokenized
   text1_tokenized$token <- clean(text1_tokenized$token)
   text2_tokenized$token <- clean(text2_tokenized$token)

@@ -78,6 +81,8 @@ diff_align <- function(
   text2_tokenized <- ignore(text2_tokenized)

   # column naming
+  text1_tokenized_prec <- stats::setNames(text1_tokenized_prec, c("from_1", "to_1", "token_1", "token_i_1"))
+  text2_tokenized_prec <- stats::setNames(text2_tokenized_prec, c("from_2", "to_2", "token_2", "token_i_2"))
   text1_tokenized_prei <- stats::setNames(text1_tokenized_prei, c("from_1", "to_1", "token_1", "token_i_1"))
   text2_tokenized_prei <- stats::setNames(text2_tokenized_prei, c("from_2", "to_2", "token_2", "token_i_2"))
   text1_tokenized <- stats::setNames(text1_tokenized, c("from_1", "to_1", "token_1", "token_i_1"))
@@ -97,7 +102,8 @@ diff_align <- function(
     maxDist = maxDist,
     q = q,
     p = p,
-    nthread = nthread
+    nthread = nthread,
+    matchNA = FALSE
   )

   # alignment
@@ -169,27 +175,14 @@ diff_align <- function(
       nthread = nthread
     )

-    alignment$token_1 <-
-      dplyr::left_join(
-        subset(alignment, TRUE, token_i_1),
-        subset(text1_tokenized_prei, TRUE, c(token_i_1, token_1) ),
-        by=c("token_i_1"="token_i_1")
-      )$token_1
-
-    alignment$token_2 <-
-      dplyr::left_join(
-        subset(alignment, TRUE, token_i_2),
-        subset(text2_tokenized_prei, TRUE, c(token_i_2, token_2) ),
-        by=c("token_i_2"="token_i_2")
-      )$token_2
   }

   # non matches
   if( dim1(text1_tokenized_prei)>0 ){
     tmp <-
       subset(
         cbind(text1_tokenized_prei, type="ignored"),
-        !(text1_tokenized_prei$token_i_2 %in% alignment$token_i_1)
+        !(text1_tokenized_prei$token_i_1 %in% alignment$token_i_1)
       )
     alignment <-
       rtext:::rbind_fill(alignment, tmp)
@@ -205,19 +198,39 @@ diff_align <- function(
       rtext:::rbind_fill(alignment, tmp)
   }

-  # return
+  # original token
+  if( dim1(alignment) > 0 ){
+    alignment$token_1 <-
+      dplyr::left_join(
+        subset(alignment, select="token_i_1"),
+        subset(text1_tokenized_prec, select=c("token_i_1", "token_1") ),
+        by=c("token_i_1"="token_i_1")
+      )$token_1
+
+    alignment$token_2 <-
+      dplyr::left_join(
+        subset(alignment, TRUE, token_i_2),
+        subset(text2_tokenized_prec, select=c("token_i_2", "token_2") ),
+        by=c("token_i_2"="token_i_2")
+      )$token_2
+  }
+
+  # column order and missing columns
   if( !("type" %in% names(alignment)) ){
     alignment <- cbind(alignment, type=character(0))
   }
+
   alignment <-
     subset(
       alignment,
       select=c(
-        token_i_1, token_i_2, distance, type,
-        from_1, to_1, from_2, to_2,
-        token_1, token_2
+        "token_i_1", "token_i_2", "distance", "type",
+        "from_1", "to_1", "from_2", "to_2",
+        "token_1", "token_2"
       )
     )
+
+  # return
   return(alignment)
 }
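
The relocated block re-attaches the original, pre-cleaning token strings (the new *_prec copies) to the alignment by token index, so cleaned tokens no longer leak into the output. A self-contained sketch of that join with made-up data:

    library(dplyr)

    alignment <- data.frame(token_i_1 = c(1L, 3L), distance = c(0, 2))
    text1_tokenized_prec <- data.frame(
      token_i_1 = 1:3,
      token_1   = c("Foo,", "bar", "Baz!")   # tokens before clean() ran
    )

    alignment$token_1 <-
      left_join(
        subset(alignment, select = "token_i_1"),
        subset(text1_tokenized_prec, select = c("token_i_1", "token_1")),
        by = "token_i_1"
      )$token_1

    alignment
    #>   token_i_1 distance token_1
    #> 1         1        0    Foo,
    #> 2         3        2    Baz!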

R/moc.R

Lines changed: 2 additions & 2 deletions
@@ -40,11 +40,11 @@ moc <- function(
   # prepare tt1 and tt2 as lists of data.frames
   tt1 <-
     text1_tokenized %>%
-    filter( !(token_i %in% res$token_i_1) )
+    dplyr::filter( !(token_i %in% res$token_i_1) )

   tt2 <-
     text2_tokenized %>%
-    filter( !(token_i %in% res$token_i_2) )
+    dplyr::filter( !(token_i %in% res$token_i_2) )

   tt1_split <- split_tt_by_length(tt1)
   tt2_split <- split_tt_by_length(tt2)
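
The dplyr:: prefix is more than style: in a package that lists dplyr in Imports without importing filter() into its namespace, a bare filter() resolves to stats::filter(), a time-series filter that errors on this kind of call. A minimal illustration with made-up data:

    library(dplyr)

    text1_tokenized <- data.frame(token_i = 1:5)
    res <- list(token_i_1 = c(2L, 4L))

    dplyr::filter(text1_tokenized, !(token_i %in% res$token_i_1))
    #>   token_i
    #> 1       1
    #> 2       3
    #> 3       5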

R/moc_helper.R

Lines changed: 22 additions & 34 deletions
@@ -3,23 +3,11 @@
 #' @keywords internal
 split_tt_by_length <- function(tt){
   tt %>%
-    dplyr::mutate(
-      token_length = nchar(token)
-    ) %>%
-    split(
-      .$token_length
-    ) %>%
-    lapply(
-      dplyr::mutate,
-      token_length = NULL
-    ) %>%
-    lapply(
-      as.data.table
-    ) %>%
-    lapply(
-      setkey,
-      token, token_i
-    )
+    dplyr::mutate( token_length = nchar(token) ) %>%
+    split( .$token_length ) %>%
+    lapply( dplyr::mutate, token_length = NULL ) %>%
+    lapply( as.data.table ) %>%
+    lapply( setkey, "token", "token_i" )
 }
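The condensed pipeline is behavior-preserving: compute each token's character length, split the table into one bucket per length, drop the helper column, and turn each bucket into a keyed data.table so candidate matches are only searched among equally long tokens. A compact runnable sketch of the same idea (toy input; setkeyv is the standard-evaluation twin of setkey and sidesteps non-standard evaluation inside lapply):

    library(data.table)
    library(dplyr)
    library(magrittr)

    tt <- data.frame(token = c("a", "to", "of", "the"), token_i = 1:4)

    tt %>%
      dplyr::mutate( token_length = nchar(token) ) %>%
      split( .$token_length ) %>%
      lapply( dplyr::mutate, token_length = NULL ) %>%
      lapply( as.data.table ) %>%
      lapply( setkeyv, c("token", "token_i") )
    #> list of keyed data.tables, one per token length (1, 2, 3)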

@@ -28,20 +16,20 @@ split_tt_by_length <- function(tt){
 #' method of comparison helper function
 #' @param tt1 tokenized text number 1
 #' @param tt2 tokenized text number 2
-#' @export
+#' @keywords internal
 moc_helper_trivial_matches <- function(tt1, tt2){
   # preparation
-  tt1 <- subset( tt1, is_unique(token), c(token, token_i))
+  tt1 <- subset( tt1, is_unique(token), select=c("token", "token_i"))
   tt1 <- data.table::as.data.table(tt1)
-  data.table::setkey(tt1, token)
+  data.table::setkey(tt1, "token")

-  tt2 <- subset( tt2, is_unique(token), c(token, token_i))
+  tt2 <- subset( tt2, is_unique(token), select=c("token", "token_i"))
   tt2 <- data.table::as.data.table(tt2)
-  data.table::setkey(tt2, token)
+  data.table::setkey(tt2, "token")

   # merge / join
   matches <- suppressWarnings(dplyr::inner_join(tt1, tt2, by="token"))
-  data.table::setkey(matches, token_i.x, token_i.y)
+  data.table::setkey(matches, "token_i.x", "token_i.y")

   # clean up names
   names(matches) <-
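
The trivial pass pairs tokens that occur exactly once in each text via an inner join on the token string. A toy sketch of the idea; is_unique() is internal to the package, so a plausible stand-in is defined here:

    library(dplyr)

    # plausible stand-in for the package-internal is_unique()
    is_unique <- function(x) !(duplicated(x) | duplicated(x, fromLast = TRUE))

    tt1 <- data.frame(token = c("the", "cat", "sat", "the"), token_i = 1:4)
    tt2 <- data.frame(token = c("a",   "cat", "lay"),        token_i = 1:3)

    t1 <- subset(tt1, is_unique(token), select = c("token", "token_i"))
    t2 <- subset(tt2, is_unique(token), select = c("token", "token_i"))

    inner_join(t1, t2, by = "token")
    #>   token token_i.x token_i.y
    #> 1   cat         2         2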
@@ -59,7 +47,7 @@ moc_helper_trivial_matches <- function(tt1, tt2){
 #' method of comparison helper function
 #' @param tt1 tokenized text number 1
 #' @param tt2 tokenized text number 2
-#' @export
+#' @keywords internal
 moc_helper_easy_matches <- function(tt1, tt2, res, type=c(1,2), fullreturn=TRUE){
   # check input
   if( is.null(tt1) | is.null(tt2) ){
@@ -73,12 +61,12 @@ moc_helper_easy_matches <- function(tt1, tt2, res, type=c(1,2), fullreturn=TRUE)
   # preparation
   tt1_tmp <-
     tt1 %>%
-    dplyr::select(token, token_i) %>%
+    subset(select = c("token", "token_i") ) %>%
     dplyr::filter(
       !(token_i %in% res$token_i_1)
     ) %>%
     as.data.table()
-  setkey(tt1_tmp, token_i)
+  setkey(tt1_tmp, "token_i")

   tt2_tmp <-
     tt2 %>%
@@ -87,7 +75,7 @@ moc_helper_easy_matches <- function(tt1, tt2, res, type=c(1,2), fullreturn=TRUE)
       !(token_i %in% res$token_i_2)
     ) %>%
     as.data.table()
-  setkey(tt2_tmp, token_i)
+  setkey(tt2_tmp, "token_i")

   # decide which tokens (from text1 or from text2) should be unique
   if( type == 1){
@@ -106,10 +94,10 @@ moc_helper_easy_matches <- function(tt1, tt2, res, type=c(1,2), fullreturn=TRUE)
   chosen <-
     choose_options(matches$token_i_1, matches$token_i_2, res$token_i_1, res$token_i_2) %>%
     as.data.table() %>%
-    setkey(token_i_1)
+    setkey("token_i_1")

   # add token to get it rbind-ed to res
-  tt1_tmp <- setNames(tt1_tmp, c("token", "token_i_1"))
+  tt1_tmp <- stats::setNames(tt1_tmp, c("token", "token_i_1"))
   chosen <- dplyr::left_join(chosen, tt1_tmp, by="token_i_1")

   # return
@@ -128,7 +116,7 @@ moc_helper_easy_matches <- function(tt1, tt2, res, type=c(1,2), fullreturn=TRUE)
 #' @param tt2 tokenized text number 2
 #' @param res data.frame of already matched
 #' @import data.table
-#' @export
+#' @keywords internal
 moc_helper_get_options_ordered_by_dist <- function(tt1, tt2, res){
   # distance between availible token positions and positions of tokens already matched
   dist <- which_dist_min_absolute(tt1$token_i, res$token_i_1)
@@ -137,7 +125,7 @@ moc_helper_get_options_ordered_by_dist <- function(tt1, tt2, res){
   res_tmp <-
     res[dist$location, ] %>%
     dplyr::select(token_i_1, token_i_2) %>%
-    setNames( paste0("res_",names(.)) )
+    stats::setNames( paste0("res_",names(.)) )
   # combine res with info from tt1
   tt1_tmp <-
     tt1 %>%
@@ -160,10 +148,10 @@ moc_helper_get_options_ordered_by_dist <- function(tt1, tt2, res){
   # delete columns
   tt1_tmp[, res_token_i_2 := NULL]
   # sort
-  data.table::setorder(tt1_tmp, min_dist_1, min_dist_2, token_i_1, token_i_2)
+  data.table::setorder(tt1_tmp, "min_dist_1", "min_dist_2", "token_i_1", "token_i_2")
   # delete columns
-  tt1_tmp[, min_dist_1 := NULL]
-  tt1_tmp[, min_dist_2 := NULL]
+  tt1_tmp[, "min_dist_1" := NULL]
+  tt1_tmp[, "min_dist_2" := NULL]
   # return
   return(tt1_tmp)
 }
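
A recurring theme in this commit is switching non-standard-evaluation column references (setkey, setorder, :=, subset/select) to quoted names, presumably to silence R CMD check's "no visible binding for global variable" notes, which fits the commit message. A generic example of the same operations, using data.table's documented standard-evaluation variants where they exist (not package code):

    library(data.table)

    dt <- data.table(min_dist_1 = c(2, 1), token_i_1 = c(10L, 20L))

    setorderv(dt, c("min_dist_1", "token_i_1"))  # SE twin of setorder()
    dt[, "min_dist_1" := NULL]                   # delete column by reference
    setkeyv(dt, "token_i_1")                     # SE twin of setkey()
    key(dt)
    #> [1] "token_i_1"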

autotest.Rexec

File mode changed: 100644 → 100755 (now executable).

man/diff_align.Rd

Lines changed: 1 addition & 1 deletion
Generated file; diff not rendered by default.

man/moc_helper_easy_matches.Rd

Lines changed: 1 addition & 0 deletions
Generated file; diff not rendered by default.

man/moc_helper_get_options_ordered_by_dist.Rd

Lines changed: 1 addition & 0 deletions
Generated file; diff not rendered by default.
