Commit d002a04

diff_align implemented

1 parent e321569 commit d002a04

18 files changed: +412 -237 lines changed

.Rbuildignore

Lines changed: 1 addition & 0 deletions

@@ -8,3 +8,4 @@ todo.txt
 dev.R
 benchmarks/*
 meta.R
+dev_save.R

.gitignore

Lines changed: 6 additions & 6 deletions

@@ -1,6 +1,6 @@
-.Rproj.user
-.Rhistory
-.RData
-.Ruserdata
-inst/doc
-README.html
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+inst/doc
+README.html

DESCRIPTION

Lines changed: 2 additions & 4 deletions

@@ -1,7 +1,7 @@
 Package: diffrprojects
 Title: Using diffr for more than two files
-Date: 2016-08-01
-Version: 0.1.2.90000
+Date: 2016-08-26
+Version: 0.1.3.90000
 Authors@R: c(
   person(
     "Peter", "Meissner",
@@ -33,8 +33,6 @@ LazyData: TRUE
 Imports:
     R6 (>= 2.1.2),
    hellno (>= 0.0.1),
-    magrittr (>= 1.5),
-    digest (>= 0.6.9),
    dplyr(>= 0.5.0),
    data.table (>= 1.9.6),
    dtplyr (>= 0.0.1),

NAMESPACE

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 # Generated by roxygen2: do not edit by hand

 export("%>%")
-export(diffr)
+export(diff_align)
 export(diffrproject)
 export(dp_text_base_data)
 export(moc_helper_easy_matches)
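
The only NAMESPACE change is the renamed export: code that called diffr() now has to call diff_align() instead, while the arguments stay the same. A minimal sketch with placeholder inputs (a fuller example follows after the file list below):

    # text_old / text_new are placeholder character strings
    # before this commit: tab <- diffrprojects::diffr(text1 = text_old, text2 = text_new)
    tab <- diffrprojects::diff_align(text1 = text_old, text2 = text_new)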

NEWS.md

Lines changed: 17 additions & 0 deletions

@@ -1,6 +1,22 @@
 NEWS diffrprojects
 ==========================================================================

+
+version 0.1.2 // 2016-08-26 ...
+--------------------------------------------------------------------------
+
+* BUGFIXES
+
+
+
+* FEATURE
+  - tests tests tests
+
+
+* DEVELOPMENT
+
+
+
 version 0.1.2 // 2016-06-09 ...
 --------------------------------------------------------------------------

@@ -9,6 +25,7 @@ version 0.1.2 // 2016-06-09 ...


 * FEATURE
+  - diff_align()


 * DEVELOPMENT

R/diffr.R

Lines changed: 115 additions & 24 deletions

@@ -1,6 +1,7 @@
-#' FUNCTION_TITLE
+#' algining texts
 #'
-#' FUNCTION_DESCRIPTION
+#' Function aligns two texts side by side as a data.frame with change type and
+#' distance given as well
 #'
 #' @param text1 first text
 #' @param text2 second text
@@ -19,17 +20,24 @@
 #' @param distance defaults to Levenshtein ("lv"); see \link[stringdist]{amatch},
 #'   \link[stringdist]{stringdist-metrics}, \link[stringdist]{stringdist}
 #' @param ... further arguments passed through to distance function
+#' @inheritParams stringdist::stringdist
 #'
 #' @return dataframe with tokens aligned according to distance
 #'
 #' @export
-diffr <- function(
+diff_align <- function(
   text1 = NULL,
   text2 = NULL,
   tokenizer = NULL,
   ignore = NULL,
   clean = NULL,
   distance = c("lv", "osa", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw", "soundex"),
+  useBytes = FALSE,
+  weight = c(d = 1, i = 1, s = 1, t = 1),
+  maxDist = Inf,
+  q = 1,
+  p = 0,
+  nthread = getOption("sd_num_thread"),
   ...
 ){
   # checking input
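
The new arguments added to the signature above (useBytes, weight, maxDist, q, p, nthread) are passed straight through to the stringdist::amatch() and stringdist::stringdist() calls later in this file instead of travelling through `...`; maxDist only affects the amatch() matching step. A sketch of a call that sets some of them (the texts and values are made up):

    # the default tokenizer splits the texts into lines
    tab <- diff_align(
      text1    = "first draft of a sentence",
      text2    = "second draft of a sentence",
      distance = "lv",   # plain Levenshtein distance
      maxDist  = 10      # tokens further than 10 edits apart are not paired by amatch()
    )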
@@ -45,6 +53,8 @@ diffr <- function(
   if( is.null(tokenizer) ){ tokenizer <- stringb::text_tokenize_lines }
   if( is.null(clean) ){ clean <- function(x){x} }
   if( is.null(ignore) ){ ignore <- function(x){x} }
+  if( length(text1) > 1){ text1 <- text_collapse(text1) }
+  if( length(text2) > 1){ text2 <- text_collapse(text2) }
   distance <- distance[1]

   # tokenize
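
The two added lines above collapse multi-element character vectors into a single string before tokenization, so that e.g. the result of readLines() can be passed in directly. text_collapse() is the package's own helper; a rough base-R equivalent, assuming a newline separator, would be:

    lines_in <- readLines("some_file.txt")       # hypothetical input file
    text1    <- paste(lines_in, collapse = "\n") # one string, re-split later by the tokenizer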
@@ -67,19 +77,27 @@
   text1_tokenized <- ignore(text1_tokenized)
   text2_tokenized <- ignore(text2_tokenized)

+  # column naming
+  text1_tokenized_prei <- stats::setNames(text1_tokenized_prei, c("from_1", "to_1", "token_1", "token_i_1"))
+  text2_tokenized_prei <- stats::setNames(text2_tokenized_prei, c("from_2", "to_2", "token_2", "token_i_2"))
+  text1_tokenized <- stats::setNames(text1_tokenized, c("from_1", "to_1", "token_1", "token_i_1"))
+  text2_tokenized <- stats::setNames(text2_tokenized, c("from_2", "to_2", "token_2", "token_i_2"))
+
   # alignment and distances
   message(" - doing distance calculation and alignment")

-  text1_tokenized <- setNames(text1_tokenized, c("from_1", "to_1", "token_1", "token_i_1"))
-  text2_tokenized <- setNames(text2_tokenized, c("from_2", "to_2", "token_2", "token_2_1"))
-
   # distance
   a <-
     stringdist::amatch(
       text1_tokenized$token_1,
       text2_tokenized$token_2,
       method = distance,
-      ...
+      useBytes = useBytes,
+      weight = weight,
+      maxDist = maxDist,
+      q = q,
+      p = p,
+      nthread = nthread
     )

   # alignment
@@ -93,40 +111,113 @@ diffr <- function(
     stringdist::stringdist(
       alignment$token_1,
       alignment$token_2,
-      method = distance
+      method = distance,
+      useBytes = useBytes,
+      weight = weight,
+      q = q,
+      p = p,
+      nthread = nthread
     )

   # type and distances
-  alignment$type <- ""
-  alignment$type[alignment$distance == 0]<-"no-change"
-  alignment$type[alignment$distance > 0]<-"change"
+  if( dim1(alignment) > 0 ){
+    alignment$type <- ""
+    alignment$type[alignment$distance == 0]<-"no-change"
+    alignment$type[alignment$distance > 0]<-"change"

-  iffer <- is.na(alignment$token_1)
-  alignment[iffer, "type"] <- "insertion"
-  alignment[iffer, "distance"] <- stringdist::stringdist("", alignment[iffer, "token_2"])
+    alignment <-
+      rtext:::rbind_fill(
+        alignment,
+        text1_tokenized[
+          !(text1_tokenized$token_i_1 %in% alignment$token_i_1),
+        ]
+      )
+
+    alignment <-
+      rtext:::rbind_fill(
+        alignment,
+        text2_tokenized[
+          !(text2_tokenized$token_i_2 %in% alignment$token_i_2),
+        ]
+      )

   iffer <- is.na(alignment$token_2)
   alignment[iffer, "type"] <- "deletion"
-  alignment[iffer, "distance"] <- stringdist::stringdist("", alignment[iffer, "token_1"])
+    alignment[iffer, "distance"] <-
+      stringdist::stringdist(
+        "",
+        alignment[iffer, "token_1"],
+        method = distance,
+        useBytes = useBytes,
+        weight = weight,
+        q = q,
+        p = p,
+        nthread = nthread
+      )

-  # non matches
-  tmp <-
-    subset(
-      cbind(text1_tokenized, type="ignored"),
-      !(text1_tokenized$token_i_1 %in% alignment$token_i_1)
+    iffer <- is.na(alignment$token_1)
+    alignment[iffer, "type"] <- "insertion"
+    alignment[iffer, "distance"] <-
+      stringdist::stringdist(
+        "",
+        alignment[iffer, "token_2"],
+        method = distance,
+        useBytes = useBytes,
+        weight = weight,
+        q = q,
+        p = p,
+        nthread = nthread
     )
-  alignment <-
-    rtext:::rbind_fill(alignment, tmp)

+    alignment$token_1 <-
+      dplyr::left_join(
+        subset(alignment, TRUE, token_i_1),
+        subset(text1_tokenized_prei, TRUE, c(token_i_1, token_1) ),
+        by=c("token_i_1"="token_i_1")
+      )$token_1
+
+    alignment$token_2 <-
+      dplyr::left_join(
+        subset(alignment, TRUE, token_i_2),
+        subset(text2_tokenized_prei, TRUE, c(token_i_2, token_2) ),
+        by=c("token_i_2"="token_i_2")
+      )$token_2
+  }
+
+  # non matches
+  if( dim1(text1_tokenized_prei)>0 ){
+    tmp <-
+      subset(
+        cbind(text1_tokenized_prei, type="ignored"),
+        !(text1_tokenized_prei$token_i_2 %in% alignment$token_i_1)
+      )
+    alignment <-
+      rtext:::rbind_fill(alignment, tmp)
+  }
+
+  if( dim1(text2_tokenized_prei)>0 ){
   tmp <-
     subset(
-      cbind(text2_tokenized, type="ignored"),
-      !(text2_tokenized$token_i_2 %in% alignment$token_i_2)
+      cbind(text2_tokenized_prei, type="ignored"),
+      !(text2_tokenized_prei$token_i_2 %in% alignment$token_i_2)
     )
   alignment <-
     rtext:::rbind_fill(alignment, tmp)
+  }

   # return
+  if( !("type" %in% names(alignment)) ){
+    alignment <- cbind(alignment, type=character(0))
+  }
+  alignment <-
+    subset(
+      alignment,
+      select=c(
+        token_i_1, token_i_2, distance, type,
+        from_1, to_1, from_2, to_2,
+        token_1, token_2
+      )
+    )
   return(alignment)
 }

R/imports.R

Lines changed: 0 additions & 14 deletions
This file was deleted.
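
Taken together, the diff_align() function introduced in R/diffr.R above can be exercised roughly as follows. The inputs are made up, and the column notes simply restate the subset() selection at the end of the function:

    # two small texts; the default tokenizer splits them into lines
    text_a <- "Dear Sir,\nplease find attached\nthe document."
    text_b <- "Dear Madam,\nplease find attached\nthe final document."

    tab <- diffrprojects::diff_align(text1 = text_a, text2 = text_b, distance = "lv")

    # tab is a data.frame with one row per aligned (or unmatched) token:
    #   token_i_1, token_i_2        token indices in text1 / text2
    #   distance                    string distance between the paired tokens
    #   type                        "no-change", "change", "insertion", "deletion"
    #                               (or "ignored" for tokens filtered out by ignore())
    #   from_1, to_1, from_2, to_2  character offsets of each token in its source text
    #   token_1, token_2            the token texts themselves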
