Skip to content

Commit 264c027

Browse files
author
Arseny Kositsyn
committed
[PGPRO-12159] Added the output of tsv lexemes positions.
If you create an index with the rum_tsvector_ops operator class, the positions of the lexemes are saved as additional information. The positions are stored in compressed form as bytea. In the posting tree, however, the additional information for high keys is stored differently, so it cannot be output yet. In all other cases, the output of additional information works correctly. Tags: rum
1 parent 7aab2f0 commit 264c027

File tree

3 files changed

+182
-38
lines changed

3 files changed

+182
-38
lines changed

src/rum.h

+6
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "storage/bufmgr.h"
2222
#include "utils/datum.h"
2323
#include "utils/memutils.h"
24+
#include "tsearch/ts_type.h"
2425

2526
#include "rumsort.h"
2627

@@ -836,6 +837,8 @@ extern RumItem *rumGetBAEntry(BuildAccumulator *accum,
836837
#define RUM_ADDINFO_JOIN 10
837838
#define RUMNProcs 10
838839

840+
#define LOWERMASK 0x1F
841+
839842
extern PGDLLEXPORT Datum rum_extract_tsvector(PG_FUNCTION_ARGS);
840843
extern PGDLLEXPORT Datum rum_extract_tsquery(PG_FUNCTION_ARGS);
841844
extern PGDLLEXPORT Datum rum_tsvector_config(PG_FUNCTION_ARGS);
@@ -847,6 +850,9 @@ extern PGDLLEXPORT Datum rum_ts_distance_td(PG_FUNCTION_ARGS);
847850

848851
extern PGDLLEXPORT Datum tsquery_to_distance_query(PG_FUNCTION_ARGS);
849852

853+
extern char* decompress_pos(char *ptr, WordEntryPos *pos);
854+
extern unsigned int count_pos(char *ptr, int len);
855+
850856
/* rum_arr_utils.c */
851857
typedef enum SimilarityType
852858
{

src/rum_debug_funcs.c

+174-32
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "access/relation.h"
2828
#include "utils/varlena.h"
2929
#include "rum.h"
30+
#include "tsearch/ts_type.h"
3031

3132
PG_FUNCTION_INFO_V1(rum_metapage_info);
3233
PG_FUNCTION_INFO_V1(rum_page_opaque_info);
@@ -115,6 +116,8 @@ static Datum category_get_datum_text(RumNullCategory category);
115116
static Oid find_add_info_oid(RumState *rum_state_ptr);
116117
static OffsetNumber find_add_info_atrr_num(RumState *rum_state_ptr);
117118

119+
static Datum get_positions_to_text_datum(Datum add_info);
120+
118121
/*
119122
* The rum_metapage_info() function is used to retrieve
120123
* information stored on the meta page of the rum index.
@@ -386,12 +389,6 @@ rum_leaf_data_page_items(PG_FUNCTION_ARGS)
386389
/* Allocating memory for a long-lived structure */
387390
inter_call_data = palloc(sizeof(rum_page_items_state));
388391

389-
/* Initializing the RumState structure */
390-
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
391-
initRumState(inter_call_data->rum_state_ptr, rel);
392-
393-
relation_close(rel, AccessShareLock);
394-
395392
/* Getting a copy of the page from the raw page */
396393
page = get_page_from_raw(raw_page);
397394

@@ -422,6 +419,12 @@ rum_leaf_data_page_items(PG_FUNCTION_ARGS)
422419
errdetail("Flags %04X, expected %04X",
423420
opaq->flags, (RUM_DATA | RUM_LEAF))));
424421

422+
/* Initializing the RumState structure */
423+
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
424+
initRumState(inter_call_data->rum_state_ptr, rel);
425+
426+
relation_close(rel, AccessShareLock);
427+
425428
/* Build a tuple descriptor for our result type */
426429
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
427430
elog(ERROR, "return type must be a row type");
@@ -494,9 +497,24 @@ rum_leaf_data_page_items(PG_FUNCTION_ARGS)
494497
values[2] = BoolGetDatum(high_key_ptr->addInfoIsNull);
495498

496499
/* Returning add info */
497-
if(!high_key_ptr->addInfoIsNull && inter_call_data->add_info_oid != 0)
500+
if(!(high_key_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
501+
&& inter_call_data->add_info_oid != BYTEAOID)
502+
{
498503
values[3] = get_datum_text_by_oid(high_key_ptr->addInfo,
499504
inter_call_data->add_info_oid);
505+
}
506+
507+
/*
508+
* In this case, we are dealing with the positions
509+
* of tokens and they need to be decoded.
510+
*/
511+
else if (!(high_key_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
512+
&& inter_call_data->add_info_oid == BYTEAOID)
513+
{
514+
/* values[3] = get_positions_to_text_datum(high_key_ptr->addInfo); */
515+
values[3] = CStringGetTextDatum("high key positions in posting tree is not supported");
516+
}
517+
500518
else nulls[3] = true;
501519

502520
/* Forming the returned tuple */
@@ -536,8 +554,23 @@ rum_leaf_data_page_items(PG_FUNCTION_ARGS)
536554
values[2] = BoolGetDatum(rum_item_ptr->addInfoIsNull);
537555

538556
/* Returning add info */
539-
if(!(rum_item_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0)
540-
values[3] = get_datum_text_by_oid(rum_item_ptr->addInfo, inter_call_data->add_info_oid);
557+
if(!(rum_item_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
558+
&& inter_call_data->add_info_oid != BYTEAOID)
559+
{
560+
values[3] = get_datum_text_by_oid(rum_item_ptr->addInfo,
561+
inter_call_data->add_info_oid);
562+
}
563+
564+
/*
565+
* In this case, we are dealing with the positions
566+
* of tokens and they need to be decoded.
567+
*/
568+
else if (!(rum_item_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
569+
&& inter_call_data->add_info_oid == BYTEAOID)
570+
{
571+
values[3] = get_positions_to_text_datum(rum_item_ptr->addInfo);
572+
}
573+
541574
else nulls[3] = true;
542575

543576
/* Forming the returned tuple */
@@ -619,12 +652,6 @@ rum_internal_data_page_items(PG_FUNCTION_ARGS)
619652
/* Allocating memory for a long-lived structure */
620653
inter_call_data = palloc(sizeof(rum_page_items_state));
621654

622-
/* Initializing the RumState structure */
623-
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
624-
initRumState(inter_call_data->rum_state_ptr, rel);
625-
626-
relation_close(rel, AccessShareLock);
627-
628655
/* Getting a copy of the page from the raw page */
629656
page = get_page_from_raw(raw_page);
630657

@@ -655,6 +682,12 @@ rum_internal_data_page_items(PG_FUNCTION_ARGS)
655682
errdetail("Flags %04X, expected %04X",
656683
opaq->flags, (RUM_DATA & ~RUM_LEAF))));
657684

685+
/* Initializing the RumState structure */
686+
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
687+
initRumState(inter_call_data->rum_state_ptr, rel);
688+
689+
relation_close(rel, AccessShareLock);
690+
658691
/* Build a tuple descriptor for our result type */
659692
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
660693
elog(ERROR, "return type must be a row type");
@@ -721,9 +754,24 @@ rum_internal_data_page_items(PG_FUNCTION_ARGS)
721754
values[3] = BoolGetDatum(high_key_ptr->addInfoIsNull);
722755

723756
/* Returning add info */
724-
if(!high_key_ptr->addInfoIsNull && inter_call_data->add_info_oid != 0)
757+
if(!(high_key_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
758+
&& inter_call_data->add_info_oid != BYTEAOID)
759+
{
725760
values[4] = get_datum_text_by_oid(high_key_ptr->addInfo,
726761
inter_call_data->add_info_oid);
762+
}
763+
764+
/*
765+
* In this case, we are dealing with the positions
766+
* of tokens and they need to be decoded.
767+
*/
768+
else if (!(high_key_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
769+
&& inter_call_data->add_info_oid == BYTEAOID)
770+
{
771+
/* values[4] = get_positions_to_text_datum(high_key_ptr->addInfo); */
772+
values[4] = CStringGetTextDatum("high key positions in posting tree is not supported");
773+
}
774+
727775
else nulls[4] = true;
728776

729777
/* Forming the returned tuple */
@@ -745,9 +793,24 @@ rum_internal_data_page_items(PG_FUNCTION_ARGS)
745793
values[3] = BoolGetDatum(posting_item_ptr->item.addInfoIsNull);
746794

747795
/* Returning add info */
748-
if(!posting_item_ptr->item.addInfoIsNull && inter_call_data->add_info_oid != 0)
796+
if(!posting_item_ptr->item.addInfoIsNull && inter_call_data->add_info_oid != 0
797+
&& inter_call_data->add_info_oid != BYTEAOID)
798+
{
749799
values[4] = get_datum_text_by_oid(posting_item_ptr->item.addInfo,
750800
inter_call_data->add_info_oid);
801+
}
802+
803+
/*
804+
* In this case, we are dealing with the positions
805+
* of tokens and they need to be decoded.
806+
*/
807+
else if (!posting_item_ptr->item.addInfoIsNull && inter_call_data->add_info_oid != 0
808+
&& inter_call_data->add_info_oid == BYTEAOID)
809+
{
810+
/* values[4] = get_positions_to_text_datum(posting_item_ptr->item.addInfo); */
811+
values[4] = CStringGetTextDatum("high key positions in posting tree is not supported");
812+
}
813+
751814
else nulls[4] = true;
752815

753816
/* Forming the returned tuple */
@@ -833,12 +896,6 @@ rum_leaf_entry_page_items(PG_FUNCTION_ARGS)
833896
/* Allocating memory for a long-lived structure */
834897
inter_call_data = palloc(sizeof(rum_page_items_state));
835898

836-
/* Initializing the RumState structure */
837-
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
838-
initRumState(inter_call_data->rum_state_ptr, rel);
839-
840-
relation_close(rel, AccessShareLock);
841-
842899
/* Getting a copy of the page from the raw page */
843900
page = get_page_from_raw(raw_page);
844901

@@ -869,6 +926,12 @@ rum_leaf_entry_page_items(PG_FUNCTION_ARGS)
869926
errdetail("Flags %04X, expected %04X",
870927
opaq->flags, RUM_LEAF)));
871928

929+
/* Initializing the RumState structure */
930+
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
931+
initRumState(inter_call_data->rum_state_ptr, rel);
932+
933+
relation_close(rel, AccessShareLock);
934+
872935
/* Build a tuple descriptor for our result type */
873936
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
874937
elog(ERROR, "return type must be a row type");
@@ -1008,10 +1071,23 @@ rum_leaf_entry_page_items(PG_FUNCTION_ARGS)
10081071
values[3] = ItemPointerGetDatum(&(rum_item_ptr->iptr));
10091072
values[4] = BoolGetDatum(rum_item_ptr->addInfoIsNull);
10101073

1011-
10121074
/* Returning add info */
1013-
if(!(rum_item_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0)
1075+
if (!(rum_item_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0 &&
1076+
inter_call_data->add_info_oid != BYTEAOID)
1077+
{
10141078
values[5] = get_datum_text_by_oid(rum_item_ptr->addInfo, inter_call_data->add_info_oid);
1079+
}
1080+
1081+
/*
1082+
* In this case, we are dealing with the positions
1083+
* of tokens and they need to be decoded.
1084+
*/
1085+
else if (!(rum_item_ptr->addInfoIsNull) && inter_call_data->add_info_oid != 0
1086+
&& inter_call_data->add_info_oid == BYTEAOID)
1087+
{
1088+
values[5] = get_positions_to_text_datum(rum_item_ptr->addInfo);
1089+
}
1090+
10151091
else nulls[5] = true;
10161092

10171093
/* The current IndexTuple does not contain a posting tree */
@@ -1101,12 +1177,6 @@ rum_internal_entry_page_items(PG_FUNCTION_ARGS)
11011177
/* Allocating memory for a long-lived structure */
11021178
inter_call_data = palloc(sizeof(rum_page_items_state));
11031179

1104-
/* Initializing the RumState structure */
1105-
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
1106-
initRumState(inter_call_data->rum_state_ptr, rel);
1107-
1108-
relation_close(rel, AccessShareLock);
1109-
11101180
/* Getting a copy of the page from the raw page */
11111181
page = get_page_from_raw(raw_page);
11121182

@@ -1137,6 +1207,12 @@ rum_internal_entry_page_items(PG_FUNCTION_ARGS)
11371207
errdetail("Flags %04X, expected %04X",
11381208
opaq->flags, 0)));
11391209

1210+
/* Initializing the RumState structure */
1211+
inter_call_data->rum_state_ptr = palloc(sizeof(RumState));
1212+
initRumState(inter_call_data->rum_state_ptr, rel);
1213+
1214+
relation_close(rel, AccessShareLock);
1215+
11401216
/* Build a tuple descriptor for our result type */
11411217
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
11421218
elog(ERROR, "return type must be a row type");
@@ -1355,7 +1431,7 @@ get_page_from_raw(bytea *raw_page)
13551431
* TODO: All types accepted by rum must be checked, but
13561432
* perhaps some types are missing or some are superfluous.
13571433
*/
1358-
static Datum
1434+
static Datum
13591435
get_datum_text_by_oid(Datum info, Oid info_oid)
13601436
{
13611437
char *str_info = NULL;
@@ -1602,3 +1678,69 @@ find_add_info_atrr_num(RumState *rum_state_ptr)
16021678
/* Need to add 1 because the attributes are numbered from 1 */
16031679
return add_info_attr_num + 1;
16041680
}
1681+
1682+
#define POS_STR_BUF_LENGHT 1024
1683+
#define POS_MAX_VAL_LENGHT 6
1684+
1685+
/*
1686+
* Decodes the compressed lexeme positions held in the additional
1687+
* information (a bytea) and returns them as a text Datum containing a
1688+
* comma-separated list. The intermediate C string is freed before returning.
1689+
*/
1690+
static Datum
1691+
get_positions_to_text_datum(Datum add_info)
1692+
{
1693+
bytea *positions;
1694+
char *ptrt;
1695+
WordEntryPos position = 0;
1696+
int32 npos;
1697+
1698+
Datum res;
1699+
char *positions_str;
1700+
char *positions_str_cur_ptr;
1701+
int cur_max_str_lenght;
1702+
1703+
positions = DatumGetByteaP(add_info);
1704+
ptrt = (char *) VARDATA_ANY(positions);
1705+
npos = count_pos(VARDATA_ANY(positions),
1706+
VARSIZE_ANY_EXHDR(positions));
1707+
1708+
/* Start with an empty string in a POS_STR_BUF_LENGHT-byte buffer */
1709+
positions_str = (char*) palloc(POS_STR_BUF_LENGHT * sizeof(char));
1710+
positions_str[0] = '\0';
1711+
cur_max_str_lenght = POS_STR_BUF_LENGHT;
1712+
positions_str_cur_ptr = positions_str;
1713+
1714+
/* Decode each of the npos positions in turn and append it to the string */
1715+
for (int i = 0; i < npos; i++)
1716+
{
1717+
/* decompress_pos() fills in position and returns the pointer used for the next call */
1718+
ptrt = decompress_pos(ptrt, &position);
1719+
1720+
/* Append the position followed by a comma */
1721+
sprintf(positions_str_cur_ptr, "%d,", position);
1722+
1723+
/* Advance the write pointer past the text just written */
1724+
positions_str_cur_ptr += strlen(positions_str_cur_ptr);
1725+
1726+
/*
1727+
* Grow the buffer by POS_STR_BUF_LENGHT once no more than
1728+
* POS_MAX_VAL_LENGHT bytes remain; the write pointer is then
1729+
* recomputed from the (possibly new) buffer address.
1730+
*/
1731+
if (cur_max_str_lenght - (positions_str_cur_ptr - positions_str) <= POS_MAX_VAL_LENGHT)
1732+
{
1733+
cur_max_str_lenght += POS_STR_BUF_LENGHT;
1734+
positions_str = (char*) repalloc(positions_str, cur_max_str_lenght * sizeof(char));
1735+
positions_str_cur_ptr = positions_str + strlen(positions_str);
1736+
}
1737+
}
1738+
1739+
/* Drop the trailing comma if at least one position was written */
1740+
if (npos > 0)
1741+
positions_str[strlen(positions_str) - 1] = '\0';
1742+
1743+
res = CStringGetTextDatum(positions_str);
1744+
pfree(positions_str);
1745+
return res;
1746+
}

src/rum_ts_utils.c

+2-6
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
#include "catalog/pg_type.h"
1717
#include "funcapi.h"
1818
#include "miscadmin.h"
19-
#include "tsearch/ts_type.h"
2019
#include "tsearch/ts_utils.h"
2120
#include "utils/array.h"
2221
#include "utils/builtins.h"
@@ -80,8 +79,6 @@ PG_FUNCTION_INFO_V1(rum_ts_join_pos);
8079

8180
PG_FUNCTION_INFO_V1(tsquery_to_distance_query);
8281

83-
static unsigned int count_pos(char *ptr, int len);
84-
static char *decompress_pos(char *ptr, WordEntryPos *pos);
8582
static Datum build_tsvector_entry(TSVector vector, WordEntry *we);
8683
static Datum build_tsvector_hash_entry(TSVector vector, WordEntry *we);
8784
static Datum build_tsquery_entry(TSQuery query, QueryOperand *operand);
@@ -964,7 +961,6 @@ rum_tsquery_timestamp_consistent(PG_FUNCTION_ARGS)
964961
}
965962

966963
#define SIXTHBIT 0x20
967-
#define LOWERMASK 0x1F
968964

969965
static unsigned int
970966
compress_pos(char *target, WordEntryPos *pos, int npos)
@@ -999,7 +995,7 @@ compress_pos(char *target, WordEntryPos *pos, int npos)
999995
return ptr - target;
1000996
}
1001997

1002-
static char *
998+
extern char *
1003999
decompress_pos(char *ptr, WordEntryPos *pos)
10041000
{
10051001
int i;
@@ -1027,7 +1023,7 @@ decompress_pos(char *ptr, WordEntryPos *pos)
10271023
}
10281024
}
10291025

1030-
static unsigned int
1026+
extern unsigned int
10311027
count_pos(char *ptr, int len)
10321028
{
10331029
int count = 0,

0 commit comments

Comments
 (0)