Skip to content

Commit 49fba27

Browse files
committed
end of day
1 parent 5672ad5 commit 49fba27

19 files changed

+428
-99
lines changed

NAMESPACE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@
33
export("%>%")
44
export(diffrproject)
55
export(dp_text_base_data)
6+
export(moc_helper_easy_matches)
7+
export(moc_helper_get_options_ordered_by_dist)
68
export(moc_helper_trivial_matches)
9+
import(data.table)
710
import(hellno)
811
import(rtext)
912
import(stringb)

R/RcppExports.R

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,19 @@
11
# This file was generated by Rcpp::compileAttributes
22
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
33

4+
#' (choose from a number of pre-sorted options)
5+
#' takes a vector pair of toki1 / toki2 and a vector pair of res_token_i_1 /
6+
#' res_token_i_2 and chooses so that each 1st and exh 2nd value only is used
7+
#' where res_token_i_x identiefies already used items.
8+
#' @param toki1 first number of number pair to choose from
9+
#' @param toki2 second number of number pair to choose from
10+
#' @param res_token_i_1 already used first numbers
11+
#' @param res_token_i_2 already used second numbers
12+
#' // @keywords internal
13+
choose_options <- function(toki1, toki2, res_token_i_1, res_token_i_2) {
14+
.Call('diffrprojects_choose_options', PACKAGE = 'diffrprojects', toki1, toki2, res_token_i_1, res_token_i_2)
15+
}
16+
417
#' (function to calculate distance matrix of integers)
518
#' takes vector of size n and vector of size m and gives back matrix of n rows and m columns
619
#' @param x a vector of type numeric

R/moc_helper.R

Lines changed: 121 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,144 @@
11
#' trivial matches
22
#'
3-
#' merhtod of comparison helper function
3+
#' merthod of comparison helper function
44
#' @param tt1 tokenized text number 1
55
#' @param tt2 tokenized text number 2
66
#' @export
77
moc_helper_trivial_matches <- function(tt1, tt2){
88
# preparation
9-
tt1 <- subset( tt1, TRUE, c(token, token_i))
9+
tt1 <- subset( tt1, is_unique(token), c(token, token_i))
1010
tt1 <- data.table::as.data.table(tt1)
1111
data.table::setkey(tt1, token)
1212

13-
tt2 <- subset( tt2, TRUE, c(token, token_i))
13+
tt2 <- subset( tt2, is_unique(token), c(token, token_i))
1414
tt2 <- data.table::as.data.table(tt2)
1515
data.table::setkey(tt2, token)
1616

1717
# merge / join
18-
matches <-
19-
suppressWarnings(dplyr::inner_join(tt1, tt2, by="token"))
20-
data.table::setkey(matches, token_i.x, token_i.y)
18+
matches <- suppressWarnings(dplyr::inner_join(tt1, tt2, by="token"))
19+
data.table::setkey(matches, token_i.x, token_i.y)
20+
2121
# clean up names
2222
names(matches) <-
2323
names(matches) %>%
2424
stringb::text_replace("\\.", "_") %>%
2525
stringb::text_replace("x", "1") %>%
2626
stringb::text_replace("y", "2")
27-
# keep unique matches only
28-
iffer <- unique(matches$token_i_1)
29-
matches <- matches[iffer, ]
30-
iffer <- unique(matches$token_i_2)
31-
matches <- matches[iffer, ]
27+
3228
# return
3329
return(matches)
3430
}
31+
32+
#' easy matches 1
33+
#'
34+
#' method of comparison helper function
35+
#' @param tt1 tokenized text number 1
36+
#' @param tt2 tokenized text number 2
37+
#' @export
38+
moc_helper_easy_matches <- function(tt1, tt2, res, type=c(1,2), fullreturn=TRUE){
39+
# check input
40+
if( is.null(tt1) | is.null(tt2) ){
41+
# return
42+
if( fullreturn ){
43+
return(res)
44+
}else{
45+
return(data.frame())
46+
}
47+
}
48+
# preparation
49+
tt1_tmp <-
50+
tt1 %>%
51+
dplyr::select(token, token_i) %>%
52+
dplyr::filter(
53+
!(token_i %in% res$token_i_1)
54+
) %>%
55+
as.data.table()
56+
setkey(tt1_tmp, token_i)
57+
58+
tt2_tmp <-
59+
tt2 %>%
60+
dplyr::select(token, token_i) %>%
61+
dplyr::filter(
62+
!(token_i %in% res$token_i_2)
63+
) %>%
64+
as.data.table()
65+
setkey(tt2_tmp, token_i)
66+
67+
# decide which tokens (from text1 or from text2) should be unique
68+
if( type == 1){
69+
tt1_tmp <- tt1_tmp %>% dplyr::filter( is_unique(token) )
70+
}else if( type == 2){
71+
tt2_tmp <- tt2_tmp %>% dplyr::filter( is_unique(token) )
72+
}
73+
74+
# get and order possible matches
75+
matches <-
76+
suppressWarnings(
77+
moc_helper_get_options_ordered_by_dist(tt1_tmp, tt2_tmp, res)
78+
)
79+
80+
# process optional matches
81+
chosen <-
82+
choose_options(matches$token_i_1, matches$token_i_2, res$token_i_1, res$token_i_2) %>%
83+
as.data.table() %>%
84+
setkey(token_i_1)
85+
86+
# add token to get it rbind-ed to res
87+
tt1_tmp <- setNames(tt1_tmp, c("token", "token_i_1"))
88+
chosen <- dplyr::left_join(chosen, tt1_tmp, by="token_i_1")
89+
90+
# return
91+
if( fullreturn ){
92+
return(rbind(res,chosen))
93+
}else{
94+
return(chosen)
95+
}
96+
}
97+
98+
99+
#' get options for machtches
100+
#'
101+
#' method of comparison helper function
102+
#' @param tt1 tokenized text number 1
103+
#' @param tt2 tokenized text number 2
104+
#' @param res data.frame of already matched
105+
#' @import data.table
106+
#' @export
107+
moc_helper_get_options_ordered_by_dist <- function(tt1, tt2, res){
108+
# distance between availible token positions and positions of tokens already matched
109+
dist <- which_dist_min_absolute(tt1$token_i, res$token_i_1)
110+
tt1$min_dist_1 <- dist$minimum
111+
# preapare information from res
112+
res_tmp <-
113+
res[dist$location, ] %>%
114+
dplyr::select(token_i_1, token_i_2) %>%
115+
setNames( paste0("res_",names(.)) )
116+
# combine res with info from tt1
117+
tt1_tmp <-
118+
tt1 %>%
119+
dplyr::select(token, token_i, min_dist_1) %>%
120+
cbind(res_tmp)
121+
# join tt1 and tt2
122+
tt2_tmp <- dplyr::select(tt2, token, token_i)
123+
tt1_tmp <-
124+
tt1_tmp %>%
125+
dplyr::inner_join(tt2_tmp, by="token")
126+
names(tt1_tmp)[names(tt1_tmp)=="token_i.x"] <- "token_i_1"
127+
names(tt1_tmp)[names(tt1_tmp)=="token_i.y"] <- "token_i_2"
128+
tt1_tmp <- data.table::as.data.table(tt1_tmp)
129+
# delete columns
130+
tt1_tmp[, token := NULL]
131+
tt1_tmp[, res_token_i_1 := NULL]
132+
# add token_i_2 position distance
133+
tt1_tmp$min_dist_2 <- 0L
134+
tt1_tmp$min_dist_2 <- abs(tt1_tmp$res_token_i_2 - tt1_tmp$token_i_2)
135+
# delete columns
136+
tt1_tmp[, res_token_i_2 := NULL]
137+
# sort
138+
data.table::setorder(tt1_tmp, min_dist_1, min_dist_2, token_i_1, token_i_2)
139+
# delete columns
140+
tt1_tmp[, min_dist_1 := NULL]
141+
tt1_tmp[, min_dist_2 := NULL]
142+
# return
143+
return(tt1_tmp)
144+
}

README.Rmd

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,32 @@ cat("#", tmp$Title)
1717

1818
**Status**
1919

20-
*unstable* - in wild developement with fuRiouS rEstRucturINg and biG biG pOKing
2120

2221
[![Travis-CI Build Status](https://travis-ci.org/petermeissner/diffrprojects.svg?branch=master)](https://travis-ci.org/petermeissner/diffrprojects)
2322
[![codecov](https://codecov.io/gh/petermeissner/diffrprojects/branch/master/graph/badge.svg)](https://codecov.io/gh/petermeissner/diffrprojects/tree/master/R)
2423
[![CRAN version](http://www.r-pkg.org/badges/version/diffrprojects)](https://cran.r-project.org/package=diffrprojects)
2524

2625

26+
27+
```{r, include=FALSE}
28+
filelist.R <- list.files("R", recursive = TRUE, pattern="\\.R$", ignore.case = TRUE, full.names = TRUE)
29+
filelist.tests <- list.files("tests", recursive = TRUE, pattern="\\.R$", ignore.case = TRUE, full.names = TRUE)
30+
filelist.cpp <- list.files("src", recursive = TRUE, pattern="\\.cpp$", ignore.case = TRUE, full.names = TRUE)
31+
lines.R <- unlist(lapply(filelist.R, readLines))
32+
lines.tests <- unlist(lapply(filelist.cpp, readLines))
33+
lines.cpp <- unlist(lapply(filelist.cpp, readLines))
34+
length.R <- length(grep("(^\\s*$)|(^\\s*#)|(^\\s*//)", lines.R, value = TRUE, invert = TRUE))
35+
length.tests <- length(grep("(^\\s*$)|(^\\s*#)|(^\\s*//)", lines.tests, value = TRUE, invert = TRUE))
36+
length.cpp <- length(grep("(^\\s*$)|(^\\s*#)|(^\\s*//)", lines.cpp, value = TRUE, invert = TRUE))
37+
```
38+
39+
40+
*unstable* - in wild developement with fuRiouS rEstRucturINg and biG biG pOKing
41+
42+
*lines of R code:* `r length.R`, *lines of C++ code:* `r length.cpp`, *lines of test code:* `r length.tests`
43+
44+
45+
2746
**Version**
2847

2948
```{r, results='asis', echo=FALSE}

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@ Using diffr for more than two files
44

55
**Status**
66

7+
[![Travis-CI Build Status](https://travis-ci.org/petermeissner/diffrprojects.svg?branch=master)](https://travis-ci.org/petermeissner/diffrprojects) [![codecov](https://codecov.io/gh/petermeissner/diffrprojects/branch/master/graph/badge.svg)](https://codecov.io/gh/petermeissner/diffrprojects/tree/master/R) [![CRAN version](http://www.r-pkg.org/badges/version/diffrprojects)](https://cran.r-project.org/package=diffrprojects)
8+
79
*unstable* - in wild developement with fuRiouS rEstRucturINg and biG biG pOKing
810

9-
[![Travis-CI Build Status](https://travis-ci.org/petermeissner/diffrprojects.svg?branch=master)](https://travis-ci.org/petermeissner/diffrprojects) [![codecov](https://codecov.io/gh/petermeissner/diffrprojects/branch/master/graph/badge.svg)](https://codecov.io/gh/petermeissner/diffrprojects/tree/master/R) [![CRAN version](http://www.r-pkg.org/badges/version/diffrprojects)](https://cran.r-project.org/package=diffrprojects)
11+
*lines of R code:* 388, *lines of C++ code:* 112, *lines of test code:* 112
1012

1113
**Version**
1214

0 commit comments

Comments
 (0)