Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow all entities from HTML5 spec. #76

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
build/
dist/
snudown.egg-info/
src/html_entities.gperf.generated
src/html_entities.h
*.pyc
*.so
Expand Down
54 changes: 51 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from distutils.spawn import find_executable
from distutils.dep_util import newer_group
from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext

import re
import os
import subprocess
import fnmatch
import json

def c_files_in(directory):
paths = []
Expand All @@ -14,11 +16,55 @@ def c_files_in(directory):
paths.append(os.path.join(directory, f))
return paths

def send_html_entities(entities_file, outfh, seen_entities):
# Convert entity list from HTML5 spec JSON and send it to gperf
with open(entities_file) as entitiesfh:
for entity, entityinfo in sorted(json.load(entitiesfh).items()):
if not entity.endswith(';'):
continue
if entity in seen_entities:
continue
seen_entities.add(entity)
# Some sanity checks on the codepoints
assert len(entityinfo['codepoints']) <= 2
for bad_range in [xrange(0, 8), xrange(11, 12), xrange(14, 31),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one's my bad: when I wrote the comment with the xranges, I forgot that the upper bound of xrange is exclusive. All of those upper bounds should be increased by one.

xrange(55296, 57343), xrange(65534, 65535)]:
for codepoint in entityinfo['codepoints']:
assert codepoint not in bad_range
representation = ",".join(str(x) for x in entityinfo['codepoints'])
# Output the entity
outfh.write(entity + ", {" + representation + "}\n")

def process_gperf_file(gperf_file, output_file):

def process_gperf_file(gperf_file, entities_file, output_file, force=False):
if not find_executable("gperf"):
raise Exception("Couldn't find `gperf`, is it installed?")
subprocess.check_call(["gperf", gperf_file, "--output-file=%s" % output_file])

# Do not rerun gperf if no change to input files
if not force and not newer_group((gperf_file, entities_file), output_file):
return

# Output combined gperf input file
gperf_temp_file = gperf_file + ".generated"
seen_entities = set()
found_separator = 0
with open(gperf_temp_file, 'w') as outfh:
with open(gperf_file) as f:
for line in f:
entity = line.strip()
if entity == '%%':
found_separator += 1
if found_separator == 2:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: A comment explaining which number maps to which section would be helpful for people who don't know gperf well.

send_html_entities(entities_file, outfh, seen_entities)
elif found_separator == 1:
entity = entity.split()[0]
seen_entities.add(entity)
outfh.write(line)
if found_separator < 2:
send_html_entities(entities_file, outfh, seen_entities)

subprocess.check_call(["gperf", gperf_temp_file,
"--output-file", output_file])

version = None
version_re = re.compile(r'^#define\s+SNUDOWN_VERSION\s+"([^"]+)"$')
Expand All @@ -32,7 +78,8 @@ def process_gperf_file(gperf_file, output_file):

class GPerfingBuildExt(build_ext):
def run(self):
process_gperf_file("src/html_entities.gperf", "src/html_entities.h")
process_gperf_file("src/html_entities.gperf", "src/html_entities.json",
"src/html_entities.h", force=self.force)
build_ext.run(self)

setup(
Expand All @@ -47,6 +94,7 @@ def run(self):
Extension(
name='snudown',
sources=['snudown.c'] + c_files_in('src/') + c_files_in('html/'),
depends=['src/html_entities.h'],
include_dirs=['src', 'html']
)
],
Expand Down
12 changes: 12 additions & 0 deletions src/html_entities.gperf
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,20 @@ inline int is_valid_numeric_entity(uint32_t entity_val)
&& entity_val <= MAX_NUM_ENTITY_VAL);
}

/* Maximum number of codepoints for a named entity. */
#define MAX_ENTITY_CODEPOINTS 2
%}
%struct-type
%define slot-name entity
struct html_entity {
const char *entity;
const int codepoints[MAX_ENTITY_CODEPOINTS];
}
%%
# Additional entities are supported by HTML5, which are converted by
# Snudown to numeric entities to maintain XHTML compatibility. The HTML5
# entities are inserted by setup.py based on html_entities.json. Source:
# http://www.w3.org/TR/2014/REC-html5-20141028/entities.json
&AElig;
&Aacute;
&Acirc;
Expand Down
Loading