 #' trivial matches
 #'
-#' merhtod of comparison helper function
+#' method of comparison helper function
 #' @param tt1 tokenized text number 1
 #' @param tt2 tokenized text number 2
 #' @export
 moc_helper_trivial_matches <- function(tt1, tt2){
   # preparation
-  tt1 <- subset( tt1, TRUE, c(token, token_i))
+  tt1 <- subset( tt1, is_unique(token), c(token, token_i))
   tt1 <- data.table::as.data.table(tt1)
   data.table::setkey(tt1, token)

-  tt2 <- subset( tt2, TRUE, c(token, token_i))
+  tt2 <- subset( tt2, is_unique(token), c(token, token_i))
   tt2 <- data.table::as.data.table(tt2)
   data.table::setkey(tt2, token)

   # merge / join
-  matches <-
-    suppressWarnings(dplyr::inner_join(tt1, tt2, by="token"))
-  data.table::setkey(matches, token_i.x, token_i.y)
+  matches <- suppressWarnings(dplyr::inner_join(tt1, tt2, by="token"))
+  data.table::setkey(matches, token_i.x, token_i.y)
+
   # clean up names
   names(matches) <-
     names(matches) %>%
     stringb::text_replace("\\.", "_") %>%
     stringb::text_replace("x", "1") %>%
     stringb::text_replace("y", "2")
-  # keep unique matches only
-  iffer <- unique(matches$token_i_1)
-  matches <- matches[iffer, ]
-  iffer <- unique(matches$token_i_2)
-  matches <- matches[iffer, ]
+
   # return
   return(matches)
 }
+
+#' easy matches 1
+#'
+#' method of comparison helper function
+#' @param tt1 tokenized text number 1
+#' @param tt2 tokenized text number 2
+#' @param res data.frame of already matched tokens
+#' @param type should tokens be required to be unique in text 1 or in text 2
+#' @param fullreturn if TRUE the new matches are returned together with res, otherwise only the new matches
+#' @import data.table
+#' @export
+moc_helper_easy_matches <- function(tt1, tt2, res, type=c(1,2), fullreturn=TRUE){
+  # use only the first element if the default vector was passed
+  type <- type[1]
+  # check input
+  if( is.null(tt1) || is.null(tt2) ){
+    # return
+    if( fullreturn ){
+      return(res)
+    }else{
+      return(data.frame())
+    }
+  }
+  # preparation
+  tt1_tmp <-
+    tt1 %>%
+    dplyr::select(token, token_i) %>%
+    dplyr::filter(
+      !(token_i %in% res$token_i_1)
+    ) %>%
+    as.data.table()
+  setkey(tt1_tmp, token_i)
+
+  tt2_tmp <-
+    tt2 %>%
+    dplyr::select(token, token_i) %>%
+    dplyr::filter(
+      !(token_i %in% res$token_i_2)
+    ) %>%
+    as.data.table()
+  setkey(tt2_tmp, token_i)
+
| 66 | + |
| 67 | + # decide which tokens (from text1 or from text2) should be unique |
| 68 | + if( type == 1){ |
| 69 | + tt1_tmp <- tt1_tmp %>% dplyr::filter( is_unique(token) ) |
| 70 | + }else if( type == 2){ |
| 71 | + tt2_tmp <- tt2_tmp %>% dplyr::filter( is_unique(token) ) |
| 72 | + } |
+
+  # get and order possible matches
+  matches <-
+    suppressWarnings(
+      moc_helper_get_options_ordered_by_dist(tt1_tmp, tt2_tmp, res)
+    )
+
+  # process optional matches
+  chosen <-
+    choose_options(matches$token_i_1, matches$token_i_2, res$token_i_1, res$token_i_2) %>%
+    as.data.table() %>%
+    setkey(token_i_1)
+
+  # add the token column so the result can be rbind-ed to res
+  tt1_tmp <- setNames(tt1_tmp, c("token", "token_i_1"))
+  chosen <- dplyr::left_join(chosen, tt1_tmp, by="token_i_1")
+
+  # return
+  if( fullreturn ){
+    return(rbind(res, chosen))
+  }else{
+    return(chosen)
+  }
+}
+
+
+#' get options for matches
+#'
+#' method of comparison helper function
+#' @param tt1 tokenized text number 1
+#' @param tt2 tokenized text number 2
+#' @param res data.frame of already matched tokens
+#' @import data.table
+#' @export
+moc_helper_get_options_ordered_by_dist <- function(tt1, tt2, res){
+  # distance between available token positions and positions of tokens already matched
+  dist <- which_dist_min_absolute(tt1$token_i, res$token_i_1)
+  tt1$min_dist_1 <- dist$minimum
+  # prepare information from res
+  res_tmp <-
+    res[dist$location, ] %>%
+    dplyr::select(token_i_1, token_i_2) %>%
+    setNames( paste0("res_", names(.)) )
+  # combine res with info from tt1
+  tt1_tmp <-
+    tt1 %>%
+    dplyr::select(token, token_i, min_dist_1) %>%
+    cbind(res_tmp)
+  # join tt1 and tt2
+  tt2_tmp <- dplyr::select(tt2, token, token_i)
+  tt1_tmp <-
+    tt1_tmp %>%
+    dplyr::inner_join(tt2_tmp, by="token")
+  names(tt1_tmp)[names(tt1_tmp)=="token_i.x"] <- "token_i_1"
+  names(tt1_tmp)[names(tt1_tmp)=="token_i.y"] <- "token_i_2"
+  tt1_tmp <- data.table::as.data.table(tt1_tmp)
+  # delete columns
+  tt1_tmp[, token := NULL]
+  tt1_tmp[, res_token_i_1 := NULL]
+  # add token_i_2 position distance
+  tt1_tmp$min_dist_2 <- 0L
+  tt1_tmp$min_dist_2 <- abs(tt1_tmp$res_token_i_2 - tt1_tmp$token_i_2)
+  # delete columns
+  tt1_tmp[, res_token_i_2 := NULL]
+  # sort
+  data.table::setorder(tt1_tmp, min_dist_1, min_dist_2, token_i_1, token_i_2)
+  # delete columns
+  tt1_tmp[, min_dist_1 := NULL]
+  tt1_tmp[, min_dist_2 := NULL]
+  # return
+  return(tt1_tmp)
+}
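
For orientation, a minimal usage sketch of the helpers above. It assumes the package that defines them is loaded, that tokenized texts are data.frames with token and token_i columns (as the roxygen docs suggest), and that the internal helpers referenced in the code (is_unique(), which_dist_min_absolute(), choose_options()) are available from the same package; the input data is made up for illustration.

# hypothetical tokenized texts; column layout assumed from the docs above
tt1 <- data.frame(token = c("a", "quick", "brown", "fox"),
                  token_i = 1:4, stringsAsFactors = FALSE)
tt2 <- data.frame(token = c("a", "quick", "red", "fox"),
                  token_i = 1:4, stringsAsFactors = FALSE)

# step 1: anchor on tokens that occur exactly once in both texts
res <- moc_helper_trivial_matches(tt1, tt2)

# step 2: match the remaining tokens, preferring positions close to the
# anchors found so far; type decides which text's tokens must be unique
res <- moc_helper_easy_matches(tt1, tt2, res, type = 1)
res <- moc_helper_easy_matches(tt1, tt2, res, type = 2)

# res now holds pairs of matched token positions (token_i_1, token_i_2)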