1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
r"""Support for regular expressions conformant to the XML Schema specification.

For the most part, XML regular expressions are similar to the POSIX
ones, and can be handled by the Python C{re} module.  The exceptions
are for multi-character (C{\w}) and category escapes (e.g., C{\p{N}} or
C{\p{IPAExtensions}}) and the character set subtraction capability.
This module supports those by scanning the regular expression,
replacing the category escapes with equivalent charset expressions.
It further detects the subtraction syntax and modifies the charset
expression to remove the unwanted code points.

The basic technique is to step through the characters of the regular
expression, entering a recursive-descent parser when one of the
translated constructs is encountered.

There is a nice set of XML regular expressions at
U{http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xsd},
with a sample document at U{
http://www.xmlschemareference.com/examples/Ch14/regexpDemo.xml}"""
 36   
 37  import re 
 38  import logging 
 39  import pyxb.utils.unicode 
 40  from pyxb.utils import six 
 41   
 42  _log = logging.getLogger(__name__) 
 43   
 44   
 45   
 46  _AllEsc = { } 
 47   
 64  _InitializeAllEsc() 
 65   
 67      """Raised when a regular expression cannot be processed..""" 
 68 -    def __init__ (self, position, description): 
  69          self.position = position 
 70          ValueError.__init__(self, 'At %d: %s' % (position, description)) 
   71   
 72  _CharClassEsc_re = re.compile(r'\\(?:(?P<cgProp>[pP]{(?P<charProp>[-A-Za-z0-9]+)})|(?P<cgClass>[^pP]))') 
 74      """Parse a U{charClassEsc<http://www.w3.org/TR/xmlschema-2/#nt-charClassEsc>} term. 
 75   
 76      This is one of: 
 77   
 78        - U{SingleCharEsc<http://www.w3.org/TR/xmlschema-2/#nt-SingleCharEsc>}, 
 79        an escaped single character such as C{E{\}n} 
 80   
 81        - U{MultiCharEsc<http://www.w3.org/TR/xmlschema-2/#nt-MultiCharEsc>}, 
 82        an escape code that can match a range of characters, 
 83        e.g. C{E{\}s} to match certain whitespace characters 
 84   
 85        - U{catEsc<http://www.w3.org/TR/xmlschema-2/#nt-catEsc>}, the 
 86        C{E{\}pE{lb}...E{rb}} Unicode property escapes including 
 87        categories and blocks 
 88   
 89        - U{complEsc<http://www.w3.org/TR/xmlschema-2/#nt-complEsc>}, 
 90        the C{E{\}PE{lb}...E{rb}} inverted Unicode property escapes 
 91   
 92      If the parsing fails, throws a RegularExpressionError. 
 93   
 94      @return: A pair C{(cps, p)} where C{cps} is a 
 95      L{pyxb.utils.unicode.CodePointSet} containing the code points 
 96      associated with the character class, and C{p} is the text offset 
 97      immediately following the escape sequence. 
 98   
 99      @raise RegularExpressionError: if the expression is syntactically 
100      invalid. 
101      """ 
102   
103      mo = _CharClassEsc_re.match(text, position) 
104      if mo: 
105          escape_code = mo.group(0) 
106          cps = _AllEsc.get(escape_code) 
107          if cps is not None: 
108              return (cps, mo.end()) 
109          char_prop = mo.group('charProp') 
110          if char_prop is not None: 
111              if char_prop.startswith('Is'): 
112                  raise RegularExpressionError(position, 'Unrecognized Unicode block %s in %s' % (char_prop[2:], escape_code)) 
113              raise RegularExpressionError(position, 'Unrecognized character property %s' % (escape_code,)) 
114          raise RegularExpressionError(position, 'Unrecognized character class %s' % (escape_code,)) 
115      raise RegularExpressionError(position, "Unrecognized escape identifier at %s" % (text[position:],)) 
 116   
def _MatchPosCharGroup(text, position):
    '''Parse a U{posCharGroup<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>} term.

    @param text: The complete text of the regular expression being
    translated.

    @param position: The offset of the first character of the group,
    i.e. just past the opening C{[} (and the C{^}, if any).

    @return: A tuple C{(cps, fs, p)} where:
      - C{cps} is a L{pyxb.utils.unicode.CodePointSet} containing the code points associated with the group;
      - C{fs} is a C{bool} that is C{True} if the next character is the C{-} in a U{charClassSub<http://www.w3.org/TR/xmlschema-2/#nt-charClassSub>} and C{False} if the group is not part of a charClassSub;
      - C{p} is the text offset of the character that ended the group: the closing C{]}, or the C{-} that introduces a subtraction.  The terminator itself is not consumed.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''

    start_position = position

    # Unique marker for an unescaped '-'.  A plain '-' string cannot be
    # used because an escape may also yield a literal dash character.
    DASH = object()

    # First pass: split the group into tokens.  Each token is a single
    # character (text), a code point set from a multi-character escape,
    # or DASH.
    tokens = []
    has_following_subtraction = False
    while True:
        if position >= len(text):
            raise RegularExpressionError(position, "Incomplete character class expression, missing closing ']'")
        ch = text[position]
        if ch == six.u('['):
            # '[' is only legal as the start of a subtracted class,
            # which must be introduced by an unescaped dash.
            if not tokens or tokens[-1] is not DASH:
                raise RegularExpressionError(position, "'[' character not allowed in character class")
            has_following_subtraction = True
            # That dash belongs to the subtraction, not to this group:
            # discard it and back up so the caller sees it.
            tokens.pop()
            position = position - 1
            break
        elif ch == six.u(']'):
            # End of the group; leave the bracket for the caller.
            break
        elif ch == six.unichr(0x5c):  # backslash: start of an escape
            cps, position = _MatchCharClassEsc(text, position)
            # When the escape yields a single character, keep it as a
            # character so it may serve as a range endpoint.
            single_char = cps.asSingleCharacter()
            if single_char is not None:
                tokens.append(single_char)
            else:
                tokens.append(cps)
        elif ch == six.u('-'):
            # Whether this is a range operator or a literal dash is
            # decided below, once its neighbours are known.
            tokens.append(DASH)
            position = position + 1
        else:
            tokens.append(ch)
            position = position + 1

    if not tokens:
        raise RegularExpressionError(position, "Empty character class not allowed")

    # A dash at either end of the group is a literal dash.
    if tokens[0] is DASH:
        tokens[0] = six.u('-')
    if tokens[-1] is DASH:
        tokens[-1] = six.u('-')

    # Second pass: fold the tokens into a code point set, combining
    # "start DASH end" triples into ranges.
    result_cps = pyxb.utils.unicode.CodePointSet()
    cur_token = 0
    while cur_token < len(tokens):
        start = tokens[cur_token]
        if cur_token + 2 < len(tokens) and tokens[cur_token + 1] is DASH:
            end = tokens[cur_token + 2]
            if not isinstance(start, six.text_type) or not isinstance(end, six.text_type):
                if start is DASH or end is DASH:
                    raise RegularExpressionError(start_position, 'Two dashes in a row is not allowed in the middle of a character class.')
                raise RegularExpressionError(start_position, 'Dashes must be surrounded by characters, not character class escapes. %r %r' %(start, end))
            if start > end:
                raise RegularExpressionError(start_position, 'Character ranges must have the lowest character first')
            result_cps.add((ord(start), ord(end)))
            cur_token = cur_token + 3
        else:
            if start is DASH:
                raise RegularExpressionError(start_position, 'Dash without an initial character')
            elif isinstance(start, six.text_type):
                result_cps.add(ord(start))
            else:
                result_cps.extend(start)
            cur_token = cur_token + 1

    return result_cps, has_following_subtraction, position
205   
def _MatchCharClassExpr(text, position):
    '''Parse a U{charClassExpr<http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    These are XML regular expression classes such as C{[abc]}, C{[a-c]}, C{[^abc]}, or C{[a-z-[q]]}.

    @param text: The complete text of the regular expression being
    translated.  The character at C{position} must be the C{[}
    starting a character class.

    @param position: The offset of the start of the character group.

    @return: A pair C{(cps, p)} where C{cps} is a
    L{pyxb.utils.unicode.CodePointSet} containing the code points
    associated with the class, and C{p} is the text offset
    immediately following the closing bracket.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''
    if position >= len(text):
        raise RegularExpressionError(position, 'Missing character class expression')
    if six.u('[') != text[position]:
        raise RegularExpressionError(position, "Expected start of character class expression, got '%s'" % (text[position],))
    position = position + 1
    if position >= len(text):
        raise RegularExpressionError(position, 'Missing character class expression')

    # A leading '^' negates the group that follows it.
    negated = (text[position] == '^')
    if negated:
        position = position + 1

    result_cps, has_following_subtraction, position = _MatchPosCharGroup(text, position)

    if negated:
        result_cps = result_cps.negate()

    if has_following_subtraction:
        # _MatchPosCharGroup stopped on the '-' that precedes the nested
        # '[...]'; these asserts check that internal contract.
        assert text[position] == six.u('-')
        assert text[position + 1] == six.u('[')
        position = position + 1
        # The subtracted class is itself a full charClassExpr, so recurse.
        sub_cps, position = _MatchCharClassExpr(text, position)
        result_cps.subtract(sub_cps)

    if position >= len(text) or text[position] != six.u(']'):
        raise RegularExpressionError(position, "Expected ']' to end character class")
    return result_cps, position + 1
 251   
def MaybeMatchCharacterClass(text, position):
    """Attempt to match a U{character class expression
    <http://www.w3.org/TR/xmlschema-2/#nt-charClassExpr>}.

    Three constructs are recognized: the C{.} wildcard, a bracketed
    class starting with C{[}, and an escape starting with a backslash.

    @param text: The complete text of the regular expression being
    translated

    @param position: The offset of the start of the potential
    expression.

    @return: C{None} if C{position} does not begin a character class
    expression; otherwise a pair C{(cps, p)} where C{cps} is a
    L{pyxb.utils.unicode.CodePointSet} containing the code points
    associated with the expression, and C{p} is the text offset
    immediately following it.

    @raise RegularExpressionError: propagated from the bracket and
    escape parsers when the construct is malformed."""
    if position >= len(text):
        return None
    c = text[position]
    if '.' == c:
        # The wildcard is a single character; step past it.
        return (pyxb.utils.unicode.WildcardEsc, position + 1)
    if '[' == c:
        return _MatchCharClassExpr(text, position)
    if '\\' == c:
        return _MatchCharClassEsc(text, position)
    return None
 278   
def XMLToPython(pattern):
    """Convert the given pattern to the format required for Python
    regular expressions.

    The translated expression is wrapped as C{^(?:...)$}.  The
    non-capturing group is required so that a top-level alternation
    such as C{a|b} is anchored on both sides as a whole; without it the
    result C{^a|b$} would accept any text that merely starts with C{a}
    or ends with C{b}.

    @param pattern: A Unicode string defining a pattern consistent
    with U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}.

    @raise RegularExpressionError: propagated from the character class
    parsers when C{pattern} contains a malformed class or escape."""
    assert isinstance(pattern, six.text_type)
    new_pattern_elts = ['^(?:']
    position = 0
    while position < len(pattern):
        cg = MaybeMatchCharacterClass(pattern, position)
        if cg is None:
            ch = pattern[position]
            if ch == six.u('^') or ch == six.u('$'):
                # These are copied through as literal characters, so
                # escape them to keep Python from reading them as
                # anchors.
                new_pattern_elts.append(six.unichr(0x5c) + ch)
            else:
                new_pattern_elts.append(ch)
            position += 1
        else:
            # A wildcard, bracketed class or escape: emit the Python
            # form of its code point set and skip past the construct.
            (cps, position) = cg
            new_pattern_elts.append(cps.asPattern())
    new_pattern_elts.append(')$')
    return ''.join(new_pattern_elts)
 310