You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
206 lines
6.6 KiB
206 lines
6.6 KiB
7 years ago
|
#! /usr/bin/env python
|
||
|
|
||
|
# $Id: unicode2rstsubs.py 7442 2012-06-13 23:27:03Z milde $
|
||
|
# Author: David Goodger <goodger@python.org>
|
||
|
# Copyright: This program has been placed in the public domain.
|
||
|
|
||
|
"""
|
||
|
unicode2rstsubs.py -- produce character entity files (reStructuredText
|
||
|
substitutions) from the W3C master unicode.xml file.
|
||
|
|
||
|
This program extracts character entity and entity set information from a
|
||
|
unicode.xml file and produces multiple reStructuredText files (in the current
|
||
|
directory) containing substitutions. Entity sets are from ISO 8879 & ISO
|
||
|
9573-13 (combined), MathML, and HTML4. One or two files are produced for each
|
||
|
entity set; a second file with a "-wide.txt" suffix is produced if there are
|
||
|
wide-Unicode characters in the set.
|
||
|
|
||
|
The input file, unicode.xml, is maintained as part of the MathML 2
|
||
|
Recommendation XML source, and is available from
|
||
|
<http://www.w3.org/2003/entities/xml/>.
|
||
|
"""
|
||
|
|
||
|
import sys
|
||
|
import os
|
||
|
import optparse
|
||
|
import re
|
||
|
from xml.parsers.expat import ParserCreate
|
||
|
|
||
|
|
||
|
usage_msg = """Usage: %s [unicode.xml]\n"""


def usage(prog, status=0, msg=None):
    """
    Write the usage line (and an optional message) to stderr, then exit.

    `prog` is substituted into the usage line, `msg` (if non-empty) is
    written on a line of its own, and `status` is passed to `sys.exit()`.
    This function never returns.
    """
    write = sys.stderr.write
    write(usage_msg % prog)
    if msg:
        write(msg + '\n')
    sys.exit(status)
|
||
|
|
||
|
def main(argv=None):
    """
    Command-line entry point.

    `argv` defaults to `sys.argv`.  At most one argument (the path of the
    input file) is accepted; without it, ``unicode.xml`` in the current
    directory is used.  Too many arguments or a missing input file are
    reported through `usage()`, which exits the program with status 2 or 1
    respectively.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) > 2:
        usage(argv[0], 2,
              'Too many arguments (%s): only 1 expected.' % (len(argv) - 1))
    if len(argv) == 2:
        inpath = argv[1]
    else:
        inpath = 'unicode.xml'
    if not os.path.isfile(inpath):
        usage(argv[0], 1, 'No such file: "%s".' % inpath)
    # Binary mode hands the raw bytes to the expat parser (which does its
    # own decoding); the ``with`` block guarantees that the input file is
    # closed again, even if processing fails.
    with open(inpath, 'rb') as infile:
        process(infile)
|
||
|
|
||
|
def process(infile):
    """
    Run the complete conversion for the open input file `infile`:
    parse it and group the entities by set, then write the set files.
    """
    extractor = CharacterEntitySetExtractor(infile)
    extractor.group()
    extractor.write_sets()
|
||
|
|
||
|
|
||
|
class CharacterEntitySetExtractor:

    """
    Extracts character entity information from unicode.xml file, groups it by
    entity set, and writes out reStructuredText substitution files.

    Usage: instantiate with an open input file, call `group()` to parse the
    file, then `write_sets()` to write one ``<set>.txt`` file per entity set
    (plus ``<set>-wide.txt`` where needed) into the current directory.
    """

    unwanted_entity_sets = ['stix',         # unknown, buggy set
                            'predefined']

    # Written at the top of every output file.  The continuation lines are
    # indented so that they remain part of the reStructuredText comment.
    header = """\
.. This data file has been placed in the public domain.
.. Derived from the Unicode character mappings available from
   <http://www.w3.org/2003/entities/xml/>.
   Processed by unicode2rstsubs.py, part of Docutils:
   <http://docutils.sourceforge.net>.
"""

    def __init__(self, infile):
        self.infile = infile
        """Input unicode.xml file."""

        self.parser = self.setup_parser()
        """XML parser."""

        self.elements = []
        """Stack of element names.  Last is current element."""

        self.sets = {}
        """Mapping of charent set name to set dict."""

        self.charid = None
        """Current character's "id" attribute value."""

        self.descriptions = {}
        """Mapping of character ID to description."""

    def setup_parser(self):
        """Return an expat parser wired to this object's handler methods."""
        parser = ParserCreate()
        parser.StartElementHandler = self.StartElementHandler
        parser.EndElementHandler = self.EndElementHandler
        parser.CharacterDataHandler = self.CharacterDataHandler
        return parser

    def group(self):
        """Parse the input file, filling `self.sets` & `self.descriptions`."""
        self.parser.ParseFile(self.infile)

    def StartElementHandler(self, name, attributes):
        """Push `name` and dispatch to a ``<name>_start`` method, if any."""
        self.elements.append(name)
        handler = getattr(self, name + '_start', None)
        if handler is not None:
            handler(name, attributes)

    def EndElementHandler(self, name):
        """Pop `name` and dispatch to a ``<name>_end`` method, if any."""
        # The message shows the element stack (``self.elements``); the
        # attribute ``self.element`` does not exist.
        assert self.elements[-1] == name, \
               'unknown end-tag %r (%r)' % (name, self.elements)
        self.elements.pop()
        handler = getattr(self, name + '_end', None)
        if handler is not None:
            handler(name)

    def CharacterDataHandler(self, data):
        """Dispatch text to a ``<current element>_data`` method, if any."""
        handler = getattr(self, self.elements[-1] + '_data', None)
        if handler is not None:
            handler(data)

    def character_start(self, name, attributes):
        """Remember the "id" of the <character> element being processed."""
        self.charid = attributes['id']

    def entity_start(self, name, attributes):
        """
        Record an <entity> element: map its "id" (the entity name) to the
        current character ID in the entity set named by its "set" attribute.
        Entities of unwanted sets are ignored.
        """
        set_name = self.entity_set_name(attributes['set'])
        if not set_name:
            return
        if set_name not in self.sets:
            # Defensive only: `entity_set_name()` registers every set name
            # it returns.
            print('bad set: %r' % set_name)
            return
        entity = attributes['id']
        members = self.sets[set_name]
        # An entity name must not refer to two different characters
        # within the same set.
        assert (entity not in members
                or members[entity] == self.charid), \
               ('sets[%r][%r] == %r (!= %r)'
                % (set_name, entity, members[entity], self.charid))
        members[entity] = self.charid

    def description_data(self, data):
        """Append `data` to the description of the current character."""
        # Accumulate, as the text may be delivered in several pieces.
        self.descriptions[self.charid] = (
            self.descriptions.get(self.charid, '') + data)

    entity_set_name_pat = re.compile(r'[0-9-]*(.+)$')
    """Pattern to strip ISO numbers off the beginning of set names."""

    def entity_set_name(self, name):
        """
        Return lowcased and standard-number-free entity set name.
        Return ``None`` for unwanted entity sets.

        As a side effect, an empty dict is registered in `self.sets` for a
        wanted set name that has not been seen before.
        """
        match = self.entity_set_name_pat.match(name)
        name = match.group(1).lower()
        if name in self.unwanted_entity_sets:
            return None
        self.sets.setdefault(name, {})
        return name

    def write_sets(self):
        """Write the output file(s) for every entity set, in sorted order."""
        for set_name in sorted(self.sets):
            self.write_set(set_name)

    def write_set(self, set_name, wide=None):
        """
        Write the substitution file for the entity set `set_name`.

        Without `wide`, the file ``<set_name>.txt`` is written and entities
        containing wide-Unicode characters are left out; if any were left
        out, the method calls itself with `wide` set to write
        ``<set_name>-wide.txt``, which contains all entities of the set.
        """
        if wide:
            outname = set_name + '-wide.txt'
        else:
            outname = set_name + '.txt'
        entity_map = self.sets[set_name]
        # Case-insensitive order; ties are broken by the exact name.
        entity_names = sorted(entity_map, key=lambda e: (e.lower(), e))
        # Width of the longest name, used to align the output columns.
        longest = max([0] + [len(e) for e in entity_names])
        has_wide = False
        # The ``with`` block closes the output file when writing is done
        # (or has failed).
        with open(outname, 'w') as outfile:
            print('writing file "%s"' % outname)
            outfile.write(self.header + '\n')
            for entity_name in entity_names:
                if self.write_entity(entity_map, set_name, entity_name,
                                     outfile, longest, wide):
                    has_wide = True
        if has_wide and not wide:
            self.write_set(set_name, 1)

    def write_entity(self, set, set_name, entity_name, outfile, longest,
                     wide=None):
        """
        Write the substitution definition for `entity_name` to `outfile`.

        `set` maps entity names to character IDs ("U" followed by one or
        more hexadecimal code points separated by "-"); `longest` is the
        length of the longest entity name of the set (for alignment).

        Unless `wide` is true, nothing is written for an entity with a code
        point above U+FFFF and 1 is returned instead; otherwise the return
        value is ``None``.  Raises `KeyError` if there is no description
        for the character.
        """
        # Note: the parameter name `set` shadows the builtin; it is kept
        # for backwards compatibility with keyword-argument callers.
        charid = set[entity_name]
        codepoints = charid[1:].split('-')
        if not wide:
            for code in codepoints:
                if int(code, 16) > 0xFFFF:
                    return 1            # wide-Unicode character
        codes = ' '.join(['U+%s' % code for code in codepoints])
        outfile.write('.. %-*s unicode:: %s .. %s\n'
                      % (longest + 2, '|' + entity_name + '|',
                         codes, self.descriptions[charid]))
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # Run as a script.  main() has no return statement, so a normal run
    # passes None to sys.exit(), i.e. exit status 0.
    exit_status = main()
    sys.exit(exit_status)
|