Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions docs/encoding.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# RBS File Encoding

## Best Practice

**Use UTF-8** for both file encoding and your system locale.

## Supported Encodings

The RBS parser supports ASCII-compatible encodings (similar to Ruby's script encoding support).

**Examples**: UTF-8, US-ASCII, Shift_JIS, EUC-JP, ...

## Unicode Codepoint Symbols

String literal types in RBS can contain Unicode codepoint escape sequences (`\uXXXX`).

When the file encoding is UTF-8, the parser translates Unicode codepoint escape sequences into the corresponding characters:

```rbs
# In UTF-8 encoded files

type t = "\u0123" # Translated to the actual Unicode character ģ
type s = "\u3042" # Translated to the actual Unicode character あ
```

When the file encoding is not UTF-8, Unicode escape sequences are interpreted literally as the string `\uXXXX`:

```rbs
# In non-UTF-8 encoded files

type t = "\u0123" # Remains as the literal string "\u0123"
```

## Implementation

The RBS gem currently does not handle file encoding itself. It relies on Ruby's encoding handling, specifically `Encoding.default_external` and `Encoding.default_internal`.

`Encoding.default_external` is the encoding Ruby assumes when it reads external resources such as files. The Ruby interpreter sets it based on the locale. `Encoding.default_internal` is the encoding Ruby converts external resources to. Its default is `nil` (no conversion).

When your locale is set to use `UTF-8` encoding, `default_external` is `Encoding::UTF_8`. As a result, RBS file content read from disk will have UTF-8 encoding.

### Parsing non-UTF-8 RBS source text

If you want to work with another encoding, ensure the source string has an ASCII-compatible encoding.

```ruby
source = '"日本語"'
RBS::Parser.parse_type(source.encode(Encoding::EUC_JP)) # => Parses successfully
RBS::Parser.parse_type(source.encode(Encoding::UTF_32)) # => Returns `nil` since UTF-32 is not ASCII compatible
```

### Specifying file encoding

Currently, RBS doesn't support specifying file encoding directly.

As a workaround, you can set `Encoding.default_external` to the desired encoding while the gem loads RBS files from storage.
2 changes: 0 additions & 2 deletions ext/rbs_extension/class_constants.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@

#include "rbs_extension.h"

VALUE RBS_Parser;

VALUE RBS;
VALUE RBS_AST;
VALUE RBS_AST_Declarations;
Expand Down
10 changes: 5 additions & 5 deletions ext/rbs_extension/legacy_location.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ void rbs_loc_legacy_alloc_children(rbs_loc *loc, unsigned short cap) {
check_children_max(cap);

size_t s = RBS_LOC_CHILDREN_SIZE(cap);
loc->children = malloc(s);
loc->children = (rbs_loc_children *) malloc(s);

*loc->children = (rbs_loc_children) {
.len = 0,
Expand All @@ -50,7 +50,7 @@ static void check_children_cap(rbs_loc *loc) {
if (loc->children->len == loc->children->cap) {
check_children_max(loc->children->cap + 1);
size_t s = RBS_LOC_CHILDREN_SIZE(++loc->children->cap);
loc->children = realloc(loc->children, s);
loc->children = (rbs_loc_children *) realloc(loc->children, s);
}
}
}
Expand Down Expand Up @@ -86,12 +86,12 @@ void rbs_loc_free(rbs_loc *loc) {
}

static void rbs_loc_mark(void *ptr) {
rbs_loc *loc = ptr;
rbs_loc *loc = (rbs_loc *) ptr;
rb_gc_mark(loc->buffer);
}

static size_t rbs_loc_memsize(const void *ptr) {
const rbs_loc *loc = ptr;
const rbs_loc *loc = (const rbs_loc *) ptr;
if (loc->children == NULL) {
return sizeof(rbs_loc);
} else {
Expand All @@ -117,7 +117,7 @@ static VALUE location_s_allocate(VALUE klass) {
}

rbs_loc *rbs_check_location(VALUE obj) {
return rb_check_typeddata(obj, &location_type);
return (rbs_loc *) rb_check_typeddata(obj, &location_type);
}

static VALUE location_initialize(VALUE self, VALUE buffer, VALUE start_pos, VALUE end_pos) {
Expand Down
10 changes: 1 addition & 9 deletions ext/rbs_extension/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -187,18 +187,10 @@ static VALUE parse_method_type_try(VALUE a) {
}

rbs_method_type_t *method_type = NULL;
rbs_parse_method_type(parser, &method_type);
rbs_parse_method_type(parser, &method_type, RB_TEST(arg->require_eof));

raise_error_if_any(parser, arg->buffer);

if (RB_TEST(arg->require_eof)) {
rbs_parser_advance(parser);
if (parser->current_token.type != pEOF) {
rbs_parser_set_error(parser, parser->current_token, true, "expected a token `%s`", rbs_token_type_str(pEOF));
raise_error(parser->error, arg->buffer);
}
}

rbs_translation_context_t ctx = rbs_translation_context_create(
&parser->constant_pool,
arg->buffer,
Expand Down
4 changes: 2 additions & 2 deletions include/rbs/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ typedef struct rbs_error_t {
* An RBS parser is a LL(3) parser.
* */
typedef struct {
rbs_lexer_t *rbs_lexer_t;
rbs_lexer_t *lexer;

rbs_token_t current_token;
rbs_token_t next_token; /* The first lookahead token */
Expand Down Expand Up @@ -127,7 +127,7 @@ rbs_ast_comment_t *rbs_parser_get_comment(rbs_parser_t *parser, int subject_line
void rbs_parser_set_error(rbs_parser_t *parser, rbs_token_t tok, bool syntax_error, const char *fmt, ...) RBS_ATTRIBUTE_FORMAT(4, 5);

bool rbs_parse_type(rbs_parser_t *parser, rbs_node_t **type);
bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type);
bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type, bool require_eof);
bool rbs_parse_signature(rbs_parser_t *parser, rbs_signature_t **signature);

bool rbs_parse_type_params(rbs_parser_t *parser, bool module_type_params, rbs_node_list_t **params);
Expand Down
2 changes: 0 additions & 2 deletions include/rbs/string.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,4 @@ size_t rbs_string_len(const rbs_string_t self);
*/
bool rbs_string_equal(const rbs_string_t lhs, const rbs_string_t rhs);

unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string);

#endif
3 changes: 2 additions & 1 deletion include/rbs/util/rbs_unescape.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <stddef.h>
#include "rbs/util/rbs_allocator.h"
#include "rbs/string.h"
#include "rbs/util/rbs_encoding.h"

/**
* Receives `rbs_parser_t` and `range`, which represents a string token or symbol token, and returns a string VALUE.
Expand All @@ -18,6 +19,6 @@
*
* @returns A new owned string that will be freed when the allocator is freed.
* */
rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input);
rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input, const rbs_encoding_t *encoding);

#endif // RBS_RBS_UNESCAPE_H
2 changes: 1 addition & 1 deletion src/location.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
void rbs_loc_alloc_children(rbs_allocator_t *allocator, rbs_location_t *loc, size_t capacity) {
RBS_ASSERT(capacity <= sizeof(rbs_loc_entry_bitmap) * 8, "Capacity %zu is too large. Max is %zu", capacity, sizeof(rbs_loc_entry_bitmap) * 8);

loc->children = rbs_allocator_malloc_impl(allocator, RBS_LOC_CHILDREN_SIZE(capacity), rbs_alignof(rbs_loc_children));
loc->children = (rbs_loc_children *) rbs_allocator_malloc_impl(allocator, RBS_LOC_CHILDREN_SIZE(capacity), rbs_alignof(rbs_loc_children));

loc->children->len = 0;
loc->children->required_p = 0;
Expand Down
Loading
Loading