Viewing file: Strings.py (12.35 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
########################################################################
# $Source: /var/local/cvsroot/4Suite/Ft/Xml/Xslt/Exslt/Strings.py,v $ $Revision: 1.23 $ $Date: 2005/02/09 11:21:20 $
"""
EXSLT - Strings

Copyright 2005 Fourthought, Inc. (USA).
Detailed license and copyright information: http://4suite.org/COPYRIGHT
Project home, documentation, distributions: http://4suite.org/
"""
import urllib, re, codecs from Ft.Xml.XPath import Conversions, NAMESPACE_NODE from Ft.Xml.Xslt import XsltRuntimeException, Error from Ft.Xml.Xslt.CopyOfElement import CopyNode
# Namespace URI of the EXSLT strings module; used below as the namespace
# part of every key in ExtFunctions and as the key of ExtNamespaces.
EXSL_STRINGS_NS = "http://exslt.org/strings"
def Align(context, target, padding, alignment=''):
    """
    The str:align function aligns a string within another string.

    context   - the XPath context (unused here).
    target    - the object whose string-value is to be aligned.
    padding   - the object whose string-value is the field the target is
                placed into; its length is the length of the result.
    alignment - optional; 'left', 'right' or 'center'. Anything else,
                including no value at all, means left alignment.

    Returns a string as long as the padding string, in which a run of
    padding characters has been replaced by the target string. If the
    target is longer than the padding, the target is truncated to the
    padding's length and returned as-is.

    See http://exslt.org/str/functions/align/str.align.html for further
    explanation.
    """
    target = Conversions.StringValue(target)
    padding = Conversions.StringValue(padding)
    alignment = alignment and Conversions.StringValue(alignment)

    target_len = len(target)
    padding_len = len(padding)

    # If the target string is longer than the padding string, then it is
    # truncated to be the same length as the padding string and returned.
    if target_len > padding_len:
        return target[:padding_len]

    # If no third argument is given or if it is not one of 'left', 'right'
    # or 'center', then it defaults to left alignment.
    if alignment == 'right':
        # Slice with an explicit non-negative end index. A negative index
        # (padding[:-target_len]) breaks for an empty target: -0 == 0, so
        # the slice would be empty and the whole padding would be lost.
        result = padding[:padding_len - target_len] + target
    elif alignment == 'center':
        # With center alignment, the range of characters replaced by the
        # target string is in the middle of the padding string, such that
        # either the number of unreplaced characters on either side of the
        # range is the same or there is one less on the left than there is
        # on the right. Floor division keeps the index an integer.
        left = (padding_len - target_len) // 2
        right = left + target_len
        result = padding[:left] + target + padding[right:]
    else:
        result = target + padding[target_len:]
    return result
def Concat(context, nodeset):
    """
    The str:concat function joins the string-values of all nodes of a
    node-set, in order, into a single string.

    context - the XPath context; its currentInstruction is reported in the
              exception raised for a bad argument.
    nodeset - must be a node-set (a Python list).

    Returns the concatenated string; an empty node-set gives an empty
    string. Raises XsltRuntimeException(Error.WRONG_ARGUMENT_TYPE) when
    the argument is not a list.
    """
    if type(nodeset) is not type([]):
        raise XsltRuntimeException(Error.WRONG_ARGUMENT_TYPE,
                                   context.currentInstruction)

    return u''.join([Conversions.StringValue(node) for node in nodeset])
def DecodeUri(context, uri, encoding=u'UTF-8'):
    """
    The str:decode-uri function turns the percent-encoded parts of a
    string, such as one would find in a URI, back into characters.

    context  - the XPath context (unused here).
    uri      - the object whose string-value is to be decoded.
    encoding - name of the character encoding the escaped octets are in.

    Returns the decoded string, or an empty string when the encoding is
    not known to the codecs module.
    """
    uri = Conversions.StringValue(uri)
    encoding = Conversions.StringValue(encoding)

    try:
        decode = codecs.lookup(encoding)[1]
    except LookupError:
        # Unsupported encoding
        return u''

    def unescape(match):
        # The match is a run such as '%E2%82%AC'. Splitting on '%' leaves
        # an empty first item followed by one hex pair per octet.
        hex_pairs = match.group().split('%')[1:]
        # A run is decoded as a whole because several octets may be needed
        # to produce a single character.
        octets = ''.join([chr(int(pair, 16)) for pair in hex_pairs])
        # Octet sequences that are invalid in this encoding are dropped.
        return decode(octets, 'ignore')[0]

    return re.sub('(?:%[0-9a-fA-F]{2})+', unescape, uri)
# Characters EncodeUri never percent-encodes.
_unreserved = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789'
               "-_.!~*'()"
               '%') # not really unreserved, but handled specially before these

# Characters EncodeUri additionally leaves alone when it is asked not to
# escape reserved characters.
_reserved = ';/?:@&=+$,[]'
def EncodeUri(context, uri, escapeReserved, encoding=u'UTF-8'):
    """
    The str:encode-uri function percent-encodes a string for embedding
    in a URI.

    context        - the XPath context (unused here).
    uri            - the object whose string-value is to be encoded.
    escapeReserved - converted to a boolean. When true, the reserved
                     characters (;/?:@&=+$,[]) are escaped as well. When
                     false they are left alone, so the given string can be
                     a URI already, with just some of its characters
                     needing to be escaped (not recommended, but users who
                     don't understand the nuances of the URI syntax tend
                     to prefer it over assembling a URI piece-by-piece).
    encoding       - name of the character encoding used to turn
                     non-ASCII characters into octets before escaping.

    Returns the encoded string, or an empty string when the encoding is
    not known to the codecs module. A character that cannot be represented
    in the encoding is emitted as '%3F' (an escaped question mark).
    """
    uri = Conversions.StringValue(uri)
    escape_reserved = Conversions.BooleanValue(escapeReserved)
    encoding = Conversions.StringValue(encoding)

    try:
        encoder = codecs.lookup(encoding)[0]
    except LookupError:
        return u''

    # The "%" is escaped only if it is not followed by two hexadecimal
    # digits; existing escape sequences are passed through untouched.
    uri = re.sub('%(?![0-9A-Fa-f]{2})', u'%25', uri)

    if escape_reserved:
        safe = _unreserved
    else:
        safe = _unreserved + _reserved

    pieces = []
    for ch in uri:
        if ch in safe:
            pieces.append(ch)
            continue

        if ord(ch) > 127:
            try:
                octets = encoder(ch, 'strict')[0]
            except UnicodeError:
                # Not valid in this encoding
                pieces.append('%3F')
                continue
        else:
            octets = chr(ord(ch))

        # The Unicode character could map to multiple bytes
        pieces.append(u''.join(['%%%02X' % ord(octet) for octet in octets]))

    return u''.join(pieces)
def Padding(context, length, chars=''):
    """
    The str:padding function creates a padding string of a certain length.

    context - the XPath context (unused here).
    length  - the object whose number-value gives the length of the
              result; any fractional part is discarded.
    chars   - optional; the object whose string-value is repeated as many
              times as necessary to reach the requested length. If the
              string is more than a character long, the last repetition
              may be truncated. If it is missing or empty, a single
              space (' ') is used.

    Returns the padding string. A length that is zero, negative, NaN or
    infinite yields an empty string.
    """
    length = Conversions.NumberValue(length)

    chars = chars and Conversions.StringValue(chars)
    if not chars:
        chars = u' '

    # XPath numbers are floating point, but sequence repetition and
    # slicing need an integer, so convert explicitly.
    try:
        length = int(length)
    except (ValueError, OverflowError):
        # NaN or infinity: there is no meaningful padding length
        return u''
    if length <= 0:
        return u''

    # Repeat only as often as needed to cover the requested length
    # (one extra repetition absorbs the remainder), then cut to size.
    repeats = length // len(chars) + 1
    return (chars * repeats)[:length]
def Replace(context, s, searchNodes, replNodes):
    """
    The str:replace function converts a string to a node-set, with each
    instance of a substring from a given list (obtained from the
    string-values of nodes in the second argument) replaced by the node at
    the corresponding position of the node-set given as the third
    argument. Unreplaced substrings become text nodes.

    context     - the XPath context; supplies the processor, the current
                  instruction's base URI and the node used to create text
                  nodes.
    s           - the object whose string-value is to be processed.
    searchNodes - the search strings, see below.
    replNodes   - the replacement nodes, see below.

    The second and third arguments can be any type of object; if either
    is not a node-set, it is treated as if it were a node-set of just one
    text node, formed from the object's string-value.

    Attribute and namespace nodes in the replacement set are erroneous but
    are treated as empty text nodes.

    A search string with no corresponding replacement node is simply
    removed from the result.

    All occurrences of the longest substrings are replaced first, and once
    a replacement is made, that span of the original string is no longer
    eligible for future replacements.

    NOTE(review): empty search strings are discarded here before any
    replacing is done, so they never match anything. The helper
    _replace() has a code path for matching an empty string between every
    character; confirm against the EXSLT description which behavior is
    intended.

    See http://exslt.org/str/functions/replace/str.replace.html for
    details.

    Returns the child nodes of the generated result tree fragment.
    """
    # The first argument may be any XPath object (a node-set such as '.',
    # a number, ...); the replacement code needs its string-value.
    s = Conversions.StringValue(s)

    # prepare a list of strings to search for (based on searchNodes)
    if type(searchNodes) is not type([]):
        search_set = [Conversions.StringValue(searchNodes)]
    else:
        search_set = map(Conversions.StringValue, searchNodes)

    # prepare a list of replacement nodes for each search string
    # (based on replNodes)
    if type(replNodes) is not type([]):
        replace_set = [context.node.createTextNode(Conversions.StringValue(replNodes))]
    else:
        # use replNodes but replace attr, ns nodes with empty text nodes;
        # an explicit branch is used so the outcome does not depend on
        # the truth value of the newly created text node
        replace_set = []
        for node in replNodes:
            if (node.nodeType == node.ATTRIBUTE_NODE or
                node.nodeType == NAMESPACE_NODE):
                node = context.node.createTextNode(u'')
            replace_set.append(node)

    # make a list of tuples that map each search string to a replacement
    # node or None; map(None, ...) pads the shorter list with None
    replacements = map(None, search_set, replace_set)
    # drop entries without a search string: surplus replacement nodes
    # (padded with None) as well as empty search strings
    replacements = [tup for tup in replacements if tup[0]]

    # Sort the tuples in ascending order by length of string.
    # So that the longest search strings will be replaced first,
    # we will process it in reverse order (it may be more efficient to
    # pop items off the end of a list; see
    # http://groups.google.com/groups?selm=3DE41EBE.B60BA9FE%40alcyone.com
    replacements.sort(lambda a, b: cmp(len(a[0]), len(b[0])))

    # generate a result tree fragment
    processor = context.processor
    processor.pushResultTree(context.currentInstruction.baseUri)
    try:
        _replace(s, replacements, processor)
    finally:
        # always pop, so the processor's result stack stays balanced
        rtf = processor.popResult()

    # return it as a node-set
    return rtf.childNodes
def _replace(s, replmap, processor):
    """
    Supports str:replace().

    s         - the string to process.
    replmap   - a list of tuples, each holding a search string and a
                replacement node or None. The list is not modified.
    processor - the processor whose current writer receives the output.

    The last entry of replmap is handled at this level: the string is cut
    at every occurrence of that entry's search string, each non-empty
    piece is processed recursively with the remaining entries, and a copy
    of the replacement node (if there is one) is written between
    consecutive pieces. With no replacement node the occurrences are
    simply deleted. Once a span has been replaced it is no longer
    available for further replacements, and all replacements for one
    search string are made before moving on to the next.

    An empty search string cuts the string into its individual
    characters, i.e. it matches in between every character.

    When replmap is empty the string is written out as text.
    """
    if not replmap:
        processor.writers[-1].text(s)
        return

    # Handle the last entry here and hand the others down to the
    # recursive calls.
    remaining = replmap[:-1]
    entry = replmap[-1]

    if entry[0]:
        pieces = s.split(entry[0])
    else:
        pieces = list(s)

    final = len(pieces) - 1
    position = 0
    for piece in pieces:
        if piece:
            _replace(piece, remaining, processor)
        # a replacement goes between pieces, never after the last one
        if position < final and entry[1]:
            CopyNode(processor, entry[1])
        position += 1
    return
def Split(context, string, pattern=u' '):
    """
    The str:split function splits up a string and returns a node set of
    token elements, each containing one token from the string.

    context - the XPath context; supplies the processor and the current
              instruction's base URI for the result tree fragment.
    string  - the object whose string-value is to be split.
    pattern - the object whose string-value is the separator
              (default=' '). The string is split at every occurrence of
              the whole pattern. An empty pattern results in a split on
              every character in the string.

    Returns the child nodes of the generated result tree fragment.
    """
    string = Conversions.StringValue(string)
    pattern = Conversions.StringValue(pattern)

    processor = context.processor

    def emit_token(text):
        # one <token> element holding the given text
        processor.writers[-1].startElement(u'token')
        processor.writers[-1].text(text)
        processor.writers[-1].endElement(u'token')

    processor.pushResultTree(context.currentInstruction.baseUri)
    try:
        if pattern:
            tokens = string.split(pattern)
        else:
            # iterating over the string itself yields single characters
            tokens = string
        for token in tokens:
            emit_token(token)
    finally:
        # always pop, so the processor's result stack stays balanced
        rtf = processor.popResult()
    return rtf.childNodes
def Tokenize(context, string, delimiters=''):
    """
    The str:tokenize function splits up a string and returns a node set
    of 'token' elements, each containing one token from the string.

    context    - the XPath context; supplies the processor and the current
                 instruction's base URI for the result tree fragment.
    string     - the object whose string-value is to be tokenized.
    delimiters - optional; the object whose string-value lists the
                 delimiting characters. The string is split at any
                 occurrence of any of these characters. When missing or
                 empty, tab, carriage return, newline and space are used.

    Runs of delimiters never produce empty tokens.

    NOTE(review): an explicitly empty delimiter string is treated exactly
    like an omitted one; verify against the EXSLT str:tokenize
    description whether it should split into single characters instead.

    Returns the child nodes of the generated result tree fragment.
    """
    string = Conversions.StringValue(string)

    delim = delimiters and Conversions.StringValue(delimiters)
    if not delim:
        delim = '\t\r\n '

    processor = context.processor

    def emit_token(text):
        # one <token> element holding the given text
        processor.writers[-1].startElement(u'token')
        processor.writers[-1].text(text)
        processor.writers[-1].endElement(u'token')

    processor.pushResultTree(context.currentInstruction.baseUri)
    try:
        pending = []
        for ch in string:
            if ch in delim:
                if pending:
                    # write out characters gathered up to now
                    emit_token(u''.join(pending))
                    pending = []
            else:
                pending.append(ch)
        # whatever follows the last delimiter
        if pending:
            emit_token(u''.join(pending))
    finally:
        # always pop, so the processor's result stack stays balanced
        rtf = processor.popResult()
    return rtf.childNodes
# Module-level extension tables -- presumably read by the XSLT processor
# when this module is registered; verify against the loader.

# Maps the EXSLT strings namespace URI to the prefix 'str'.
ExtNamespaces = {
    EXSL_STRINGS_NS : 'str',
    }

# Maps (namespace URI, local name) pairs to the functions implementing them.
ExtFunctions = {
    (EXSL_STRINGS_NS, 'align') : Align,
    (EXSL_STRINGS_NS, 'concat') : Concat,
    (EXSL_STRINGS_NS, 'decode-uri') : DecodeUri,
    (EXSL_STRINGS_NS, 'encode-uri') : EncodeUri,
    (EXSL_STRINGS_NS, 'padding') : Padding,
    (EXSL_STRINGS_NS, 'replace') : Replace,
    (EXSL_STRINGS_NS, 'split') : Split,
    (EXSL_STRINGS_NS, 'tokenize') : Tokenize,
    }

# This module defines no extension elements.
ExtElements = {}
|