You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
205 lines
6.6 KiB
205 lines
6.6 KiB
#! /usr/bin/env python |
|
|
|
# $Id: unicode2rstsubs.py 7442 2012-06-13 23:27:03Z milde $ |
|
# Author: David Goodger <goodger@python.org> |
|
# Copyright: This program has been placed in the public domain. |
|
|
|
""" |
|
unicode2rstsubs.py -- produce character entity files (reStructuredText
|
substitutions) from the W3C master unicode.xml file. |
|
|
|
This program extracts character entity and entity set information from a |
|
unicode.xml file and produces multiple reStructuredText files (in the current |
|
directory) containing substitutions. Entity sets are from ISO 8879 & ISO |
|
9573-13 (combined), MathML, and HTML4. One or two files are produced for each |
|
entity set; a second file with a "-wide.txt" suffix is produced if there are |
|
wide-Unicode characters in the set. |
|
|
|
The input file, unicode.xml, is maintained as part of the MathML 2 |
|
Recommendation XML source, and is available from
|
<http://www.w3.org/2003/entities/xml/>. |
|
""" |
|
|
|
import sys |
|
import os |
|
import optparse |
|
import re |
|
from xml.parsers.expat import ParserCreate |
|
|
|
|
|
usage_msg = """Usage: %s [unicode.xml]\n"""


def usage(prog, status=0, msg=None):
    """
    Print the usage line to stderr and terminate the program.

    `prog` is interpolated into the usage line as the program name.
    `msg`, if non-empty, is written on its own line after the usage line.
    `status` is passed to `sys.exit()` as the process exit status.

    This function never returns: it always raises `SystemExit`.
    """
    text = usage_msg % prog
    if msg:
        text += msg + '\n'
    sys.stderr.write(text)
    sys.exit(status)
|
|
|
def main(argv=None):
    """
    Command-line entry point.

    `argv` is the full argument vector including the program name; it
    defaults to `sys.argv`.  At most one argument (the path of the input
    file) is accepted; without it, ``unicode.xml`` in the current directory
    is used.

    Exits via `usage()` with status 2 if too many arguments are given and
    with status 1 if the input path is not an existing file.  Returns
    ``None`` after the input has been processed.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) > 2:
        # usage() does not return (it calls sys.exit()).
        usage(argv[0], 2,
              'Too many arguments (%s): only 1 expected.' % (len(argv) - 1))
    inpath = argv[1] if len(argv) == 2 else 'unicode.xml'
    if not os.path.isfile(inpath):
        usage(argv[0], 1, 'No such file: "%s".' % inpath)
    # Python 3 gets a binary-mode file; Python 2 keeps the default text
    # mode, exactly as before.
    mode = 'rb' if sys.version_info >= (3, 0) else 'r'
    # Close the input file deterministically, even if processing fails.
    with open(inpath, mode) as infile:
        process(infile)
|
|
|
def process(infile):
    """
    Run the whole conversion for the open input file `infile`: parse it
    and group the entities by set, then write the substitution files.
    """
    extractor = CharacterEntitySetExtractor(infile)
    extractor.group()
    extractor.write_sets()
|
|
|
|
|
class CharacterEntitySetExtractor:

    """
    Extracts character entity information from unicode.xml file, groups it by
    entity set, and writes out reStructuredText substitution files.

    Usage: instantiate with an open unicode.xml file object, call `group()`
    to parse it, then `write_sets()` to write the output files into the
    current directory.

    Parsing is event driven: for every XML element ``foo`` the expat
    callbacks dispatch to the optional methods ``foo_start``, ``foo_end``
    and ``foo_data`` of this class, if they exist.
    """

    unwanted_entity_sets = ['stix',         # unknown, buggy set
                            'predefined']

    # The continuation lines are indented so that they remain part of the
    # reStructuredText comment opened by the preceding ".." line.
    header = """\
.. This data file has been placed in the public domain.
.. Derived from the Unicode character mappings available from
   <http://www.w3.org/2003/entities/xml/>.
   Processed by unicode2rstsubs.py, part of Docutils:
   <http://docutils.sourceforge.net>.
"""

    def __init__(self, infile):
        self.infile = infile
        """Input unicode.xml file."""

        self.parser = self.setup_parser()
        """XML parser."""

        self.elements = []
        """Stack of element names.  Last is current element."""

        self.sets = {}
        """Mapping of charent set name to set dict."""

        self.charid = None
        """Current character's "id" attribute value."""

        self.descriptions = {}
        """Mapping of character ID to description."""

    def setup_parser(self):
        """Return a new expat parser wired to this object's handlers."""
        parser = ParserCreate()
        parser.StartElementHandler = self.StartElementHandler
        parser.EndElementHandler = self.EndElementHandler
        parser.CharacterDataHandler = self.CharacterDataHandler
        return parser

    def group(self):
        """Parse the input file, filling `self.sets` & `self.descriptions`."""
        self.parser.ParseFile(self.infile)

    def StartElementHandler(self, name, attributes):
        """Push `name` on the element stack; call ``<name>_start`` if any."""
        self.elements.append(name)
        handler = getattr(self, name + '_start', None)
        if handler is not None:
            handler(name, attributes)

    def EndElementHandler(self, name):
        """Pop `name` off the element stack; call ``<name>_end`` if any."""
        # The message must reference `self.elements` (the stack); the
        # former `self.element` does not exist and turned a failed check
        # into an AttributeError.
        assert self.elements[-1] == name, \
               'unknown end-tag %r (%r)' % (name, self.elements)
        self.elements.pop()
        handler = getattr(self, name + '_end', None)
        if handler is not None:
            handler(name)

    def CharacterDataHandler(self, data):
        """Pass character data to ``<current element>_data`` if any."""
        handler = getattr(self, self.elements[-1] + '_data', None)
        if handler is not None:
            handler(data)

    def character_start(self, name, attributes):
        """Remember the "id" of the <character> element being parsed."""
        self.charid = attributes['id']

    def entity_start(self, name, attributes):
        """
        Record an <entity> element: map its "id" (the entity name) to the
        current character ID within the entity's (normalized) set.

        Entities from unwanted sets are ignored.  An entity name that is
        already mapped to a different character in the same set triggers
        an AssertionError.
        """
        set_name = self.entity_set_name(attributes['set'])
        if not set_name:
            return
        if set_name not in self.sets:
            print('bad set: %r' % set_name)
            return
        members = self.sets[set_name]
        entity = attributes['id']
        assert (entity not in members
                or members[entity] == self.charid), \
               ('sets[%r][%r] == %r (!= %r)'
                % (set_name, entity, members[entity], self.charid))
        members[entity] = self.charid

    def description_data(self, data):
        """Append `data` to the description of the current character."""
        # Text may arrive in several chunks, so accumulate.
        self.descriptions[self.charid] = (
            self.descriptions.get(self.charid, '') + data)

    entity_set_name_pat = re.compile(r'[0-9-]*(.+)$')
    """Pattern to strip ISO numbers off the beginning of set names."""

    def entity_set_name(self, name):
        """
        Return lowcased and standard-number-free entity set name.
        Return ``None`` for unwanted entity sets.

        As a side effect, an empty entry for a wanted set is created in
        `self.sets` if there is none yet.
        """
        match = self.entity_set_name_pat.match(name)
        name = match.group(1).lower()
        if name in self.unwanted_entity_sets:
            return None
        self.sets.setdefault(name, {})
        return name

    def write_sets(self):
        """Write the files for all collected sets, in sorted name order."""
        for set_name in sorted(self.sets):
            self.write_set(set_name)

    def write_set(self, set_name, wide=None):
        """
        Write the substitution file for entity set `set_name`.

        If `wide` is false, write ``<set_name>.txt`` and leave out entities
        with code points above U+FFFF; if any were left out, a second file
        is written by calling this method again with `wide` true.
        If `wide` is true, write ``<set_name>-wide.txt`` with all entities.
        """
        if wide:
            outname = set_name + '-wide.txt'
        else:
            outname = set_name + '.txt'
        members = self.sets[set_name]
        # Case-insensitive order, with the exact name as tie-breaker.
        entity_names = sorted(members, key=lambda e: (e.lower(), e))
        longest = max([len(e) for e in entity_names] + [0])
        has_wide = False
        # The `with` block guarantees the file is flushed and closed.
        with open(outname, 'w') as outfile:
            print('writing file "%s"' % outname)
            outfile.write(self.header + '\n')
            for entity_name in entity_names:
                if self.write_entity(members, set_name, entity_name,
                                     outfile, longest, wide):
                    has_wide = True
        if has_wide and not wide:
            self.write_set(set_name, 1)

    def write_entity(self, set, set_name, entity_name, outfile, longest,
                     wide=None):
        """
        Write the substitution definition for `entity_name` to `outfile`.

        `set` maps entity names to character IDs (a "U" followed by one or
        more hyphen-separated hexadecimal code points); `longest` is the
        length of the longest entity name, used to align the output.
        `set_name` is not used.

        Return true *without writing anything* if `wide` is false and the
        character contains a code point above U+FFFF; return false
        otherwise.
        """
        charid = set[entity_name]
        codes = charid[1:].split('-')   # drop the leading "U"
        if not wide:
            for code in codes:
                if int(code, 16) > 0xFFFF:
                    return True         # wide-Unicode character
        outfile.write('.. %-*s unicode:: %s .. %s\n'
                      % (longest + 2, '|' + entity_name + '|',
                         ' '.join(['U+%s' % code for code in codes]),
                         self.descriptions[charid]))
        return False
|
|
|
|
|
if __name__ == '__main__':
    # Run as a script: hand main()'s return value to sys.exit() as the
    # process exit status (``None`` means success, i.e. status 0).
    exit_status = main()
    sys.exit(exit_status)
|
|
|