Class REXML::Parsers::BaseParser
In: lib/rexml/parsers/baseparser.rb
Parent: Object

Using the Pull Parser

This API is experimental, and subject to change.

 parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
 while parser.has_next?
   res = parser.next
   puts res[1]['att'] if res.start_tag? and res[0] == 'b'
 end

See the PullEvent class for information on the content of the results. The data is identical to the arguments passed for the various events to the StreamListener API.

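Notice that parse errors are delivered as events rather than raised by the parser, so the caller decides how to handle them: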

 parser = PullParser.new( "<a>BAD DOCUMENT" )
 while parser.has_next?
   res = parser.next
   raise res[1] if res.error?
 end

Nat Price gave me some good ideas for the API.

Methods

add_listener   empty?   entity   has_next?   new   normalize   peek   position   pull   stream=   unnormalize   unshift  

Constants

NCNAME_STR = '[\w:][\-\w\d.]*'
NAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
NAMECHAR = '[\-\w\d\.:]'
NAME = "([\\w:]#{NAMECHAR}*)"
NMTOKEN = "(?:#{NAMECHAR})+"
NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
REFERENCE_RE = /#{REFERENCE}/
DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um
COMMENT_START = /\A<!--/u
COMMENT_PATTERN = /<!--(.*?)-->/um
CDATA_START = /\A<!\[CDATA\[/u
CDATA_END = /^\s*\]\s*>/um
CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START = /\A<\?xml\s/u;
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
INSTRUCTION_START = /\A<\?/u
INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{NAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/um
CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um
VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
# Captures the value of the standalone pseudo-attribute of an XML declaration. Whitespace on either side of "=" is optional.
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
ENTITY_START = /^\s*<!ENTITY/
IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'].*?['"])?(\s+['"].*?["'])?/u
ELEMENTDECL_START = /^\s*<!ELEMENT/um
ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
ATTDEF_RE = /#{ATTDEF}/
ATTLISTDECL_START = /^\s*<!ATTLIST/um
ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
NOTATIONDECL_START = /^\s*<!NOTATION/um
PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
TEXT_PATTERN = /\A([^<]*)/um
PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"   # Entity constants
SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
NDATADECL = "\\s+NDATA\\s+#{NAME}"
PEREFERENCE = "%#{NAME};"
ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
EREFERENCE = /&(?!#{NAME};)/
DEFAULT_ENTITIES = { 'gt' => [/&gt;/, '&gt;', '>', />/], 'lt' => [/&lt;/, '&lt;', '<', /</], 'quot' => [/&quot;/, '&quot;', '"', /"/], "apos" => [/&apos;/, "&apos;", "'", /'/] }
MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um   # These are patterns to identify common markup errors, to make the error messages more informative.

Attributes

source  [R] 

Public Class methods

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 106
# Creates a parser that reads from +source+.
#
# All set-up is delegated to the +stream=+ writer, so constructing a parser
# and re-pointing an existing one go through the same code path.
def initialize(source)
  self.stream = source
end

Public Instance methods

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 110
# Registers +listener+ so that it is handed every event returned by +pull+.
#
# The first registration wraps this object's +pull+: the existing method is
# kept under the name +_old_pull+, and the replacement passes each event to
# every registered listener's +receive+ method before returning the event.
# Later registrations only append to the listener list.
def add_listener(listener)
  unless defined?(@listeners) && @listeners
    @listeners = []
    # The wrapper is evaluated against this instance only.
    instance_eval "alias :_old_pull :pull\ndef pull\nevent = _old_pull\n@listeners.each do |listener|\nlistener.receive event\nend\nevent\nend\n"
  end
  @listeners << listener
end

Returns true if there are no more events.

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 149
# True when nothing is left to deliver: the source reports itself empty and
# the event stack (@stack) holds no events.
def empty?
  @source.empty? && @stack.empty?
end

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 384
# Looks up the replacement text for the entity named +reference+.
#
# reference:: entity name without the surrounding "&" and ";".
# entities::  optional lookup table of name => value; consulted first.
#
# Falls back to the third element of the matching DEFAULT_ENTITIES entry when
# +entities+ has no value for the name. The value found is passed through
# +unnormalize+ before being returned. Returns nil when the name is unknown.
def entity( reference, entities )
  value = entities && entities[ reference ]
  unless value
    builtin = DEFAULT_ENTITIES[ reference ]
    value = builtin && builtin[2]
  end
  value ? unnormalize( value, entities ) : nil
end

Returns true if there are more events. Synonymous with !empty?

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 154
# True while at least one more event can be delivered, i.e. the source still
# has input or the event stack (@stack) is not empty.
def has_next?
  !@source.empty? || !@stack.empty?
end

Escapes all possible entities

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 395
# Escapes +input+ for use as XML text and returns the result as a new String.
#
# input::         the String to escape; it is left unmodified.
# entities::      optional table of entity name => replacement text. Every
#                 occurrence of a value in the text is replaced by "&name;".
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted from +entities+.
#
# Ampersands that do not start a well-formed "&name;" reference become
# "&amp;", and the characters covered by DEFAULT_ENTITIES are replaced by
# their entity references.
def normalize( input, entities=nil, entity_filter=nil )
  # dup (not clone) so that a frozen input still yields a mutable copy.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( EREFERENCE, '&amp;' )
  entities.each do |key, value|
    # The filter lists entity names, so it is tested against the key.
    next if entity_filter and entity_filter.include?( key )
    copy.gsub!( value, "&#{key};" )
  end if entities
  # Substituted values may have introduced new bare ampersands.
  copy.gsub!( EREFERENCE, '&amp;' )
  DEFAULT_ENTITIES.each do |key, value|
    copy.gsub!( value[3], value[1] )
  end
  copy
end

Peek at the event at index depth in the stack of pending events. The first element on the stack is at depth 0. If depth is -1, this will parse to the end of the input stream and return the last event, which is always :end_document. Be aware that this causes the stream to be parsed up to the event at that depth, so you can effectively pre-parse the entire document (pull the entire thing into memory) using this method.

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 170
# Returns the event at index +depth+ of the event stack without consuming it.
#
# depth:: 0-based index of the wanted event (default 0); -1 means "pull
#         events until the parser is empty and return the last one".
#
# Events are obtained with +pull+ as needed and then stored on @stack, so
# they are delivered again by later calls. Raises a RuntimeError for any
# depth below -1.
def peek depth=0
  raise %Q[Illegal argument "#{depth}"] if depth < -1
  pulled = []
  if depth == -1
    pulled.push( pull ) until empty?
  else
    wanted = depth + 1
    pulled.push( pull ) while @stack.size + pulled.size < wanted
  end
  @stack += pulled unless pulled.size == 0
  @stack[depth]
end

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 139
# Returns the source's own +position+ when the source provides one, and 0
# otherwise.
def position
  # FIXME: sources without a position method always report 0
  return 0 unless @source.respond_to?( :position )
  @source.position
end

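Returns the next event. In BaseParser the event is a plain Array whose first element is a Symbol naming the event type (for example [ :start_element, name, attributes ] or [ :text, text ]), not a PullEvent object.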

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 185
# Returns the next parse event as an Array whose first element is a Symbol
# naming the event. The shapes produced by this method are:
#
#   [ :xmldecl, version, encoding, standalone ]
#   [ :processing_instruction, target, content ]
#   [ :comment, text ]
#   [ :start_doctype, name, pub_sys, long_name, uri ]
#   [ :externalentity, text ]
#   [ :elementdecl, text ]
#   [ :entitydecl, name, ... ]
#   [ :attlistdecl, element, pairs, contents ]
#   [ :notationdecl, name, kind, public_id, system_id ]
#   [ :end_doctype ]
#   [ :start_element, name, attributes ]
#   [ :end_element, name ]
#   [ :cdata, text ]
#   [ :text, text ]
#   [ :end_document ]
#
# Events previously stored on @stack are returned before any new input is
# parsed.
#
# Raises REXML::ParseException for malformed input. Any other StandardError
# raised while parsing element content is wrapped in a ParseException.
def pull
  # A self-closing tag was reported as :start_element by the previous call;
  # deliver its matching :end_element now.
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  return @stack.shift if @stack.size > 0
  @source.read if @source.buffer.size<2
  # Prolog: nothing but whitespace, comments, declarations and processing
  # instructions has been seen so far.
  if @document_status == nil
    word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
    word = word[1] unless word.nil?
    case word
    when COMMENT_START
      return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
    when XMLDECL_START
      results = @source.match( XMLDECL_PATTERN, true )[1]
      version = VERSION.match( results )
      version = version[1] unless version.nil?
      encoding = ENCODING.match(results)
      encoding = encoding[1] unless encoding.nil?
      @source.encoding = encoding
      standalone = STANDALONE.match(results)
      standalone = standalone[1] unless standalone.nil?
      return [ :xmldecl, version, encoding, standalone ]
    when INSTRUCTION_START
      return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
    when DOCTYPE_START
      md = @source.match( DOCTYPE_PATTERN, true )
      identity = md[1]
      close = md[2]
      identity =~ IDENTITY
      name = $1
      raise REXML::ParseException.new( "DOCTYPE is missing a name", @source ) if name.nil?
      pub_sys = $2.nil? ? nil : $2.strip
      long_name = $3.nil? ? nil : $3.strip
      uri = $4.nil? ? nil : $4.strip
      args = [ :start_doctype, name, pub_sys, long_name, uri ]
      if close == ">"
        # No internal subset: queue the matching :end_doctype right away.
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        md = @source.match(/^\s*/um, true)
        @stack << [ :end_doctype ]
      else
        @document_status = :in_doctype
      end
      return args
    when /^\s+/
      # Leading whitespace: handled by the content parsing further down.
    else
      @document_status = :after_doctype
      @source.read if @source.buffer.size<2
      md = @source.match(/\s*/um, true)
    end
  end
  # Internal DTD subset: one declaration per call.
  if @document_status == :in_doctype
    md = @source.match(/\s*(.*?>)/um)
    case md[1]
    when SYSTEMENTITY 
      match = @source.match( SYSTEMENTITY, true )[1]
      return [ :externalentity, match ]

    when ELEMENTDECL_START
      return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

    when ENTITY_START
      match = @source.match( ENTITYDECL, true ).to_a.compact
      match[0] = :entitydecl
      ref = false
      if match[1] == '%'
        ref = true
        match.delete_at 1
      end
      # Now we have to sort out what kind of entity reference this is
      if match[2] == 'SYSTEM'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match.delete_at(4) if match.size > 4 # Chop out NDATA decl
        # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
      elsif match[2] == 'PUBLIC'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match[4] = match[4][1..-2] # HREF
        # match is [ :entity, name, PUBLIC, pubid, href ]
      else
        match[2] = match[2][1..-2]
        match.pop if match.size == 4
        # match is [ :entity, name, value ]
      end
      match << '%' if ref
      return match
    when ATTLISTDECL_START
      md = @source.match( ATTLISTDECL_PATTERN, true )
      raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
      element = md[1]
      contents = md[0]

      pairs = {}
      values = md[0].scan( ATTDEF_RE )
      values.each do |attdef|
        unless attdef[3] == "#IMPLIED"
          attdef.compact!
          val = attdef[3]
          val = attdef[4] if val == "#FIXED "
          pairs[attdef[0]] = val
        end
      end
      return [ :attlistdecl, element, pairs, contents ]
    when NOTATIONDECL_START
      md = nil
      if @source.match( PUBLIC )
        md = @source.match( PUBLIC, true )
        vals = [md[1],md[2],md[4],md[6]]
      elsif @source.match( SYSTEM )
        md = @source.match( SYSTEM, true )
        vals = [md[1],md[2],nil,md[4]]
      else
        raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
      end
      return [ :notationdecl, *vals ]
    when CDATA_END
      @document_status = :after_doctype
      @source.match( CDATA_END, true )
      return [ :end_doctype ]
    end
  end
  # Element content.
  begin
    if @source.buffer[0] == ?<
      if @source.buffer[1] == ?/
        last_tag = @tags.pop
        md = @source.match( CLOSE_MATCH, true )
        raise REXML::ParseException.new( "Missing end tag for "+
          "'#{last_tag}' (got \"#{md[1]}\")", 
          @source) unless last_tag == md[1]
        return [ :end_element, last_tag ]
      elsif @source.buffer[1] == ?!
        md = @source.match(/\A(\s*[^>]*>)/um)
        raise REXML::ParseException.new("Malformed node", @source) unless md
        if md[0][2] == ?-
          md = @source.match( COMMENT_PATTERN, true )
          return [ :comment, md[1] ] if md
        else
          md = @source.match( CDATA_PATTERN, true )
          return [ :cdata, md[1] ] if md
        end
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.buffer[1] == ??
        md = @source.match( INSTRUCTION_PATTERN, true )
        return [ :processing_instruction, md[1], md[2] ] if md
        raise REXML::ParseException.new( "Bad instruction declaration",
          @source)
      else
        # Get the next tag
        md = @source.match(TAG_MATCH, true)
        unless md
          # Check for missing attribute quotes
          raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
          raise REXML::ParseException.new("malformed XML: missing tag start", @source) 
        end
        attrs = []
        if md[2].size > 0
          attrs = md[2].scan( ATTRIBUTE_PATTERN )
          raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
        end

        if md[4]
          # Self-closing tag: remember it so the next call emits :end_element.
          @closed = md[1]
        else
          @tags.push( md[1] )
        end
        attributes = {}
        attrs.each { |a,b,c| attributes[a] = c }
        return [ :start_element, md[1], attributes ]
      end
    else
      md = @source.match( TEXT_PATTERN, true )
      if md[0].length == 0
        @source.match( /(\s+)/, true )
      end
      return [ :text, md[1] ]
    end
  rescue REXML::ParseException
    raise
  rescue StandardError => error
    # Only StandardError is wrapped, so that Interrupt, SystemExit and similar
    # exceptions reach the caller unchanged.
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, error )
  end
  return [ :dummy ]
end

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 130
# Points the parser at a new input and resets all parsing state.
#
# source:: handed to SourceFactory.create_from; the result becomes @source.
#
# After the call there is no pending self-closed tag, the document status is
# unset, and the open-tag list, the event stack and the entity list are empty.
def stream=(source)
  @source = SourceFactory.create_from(source)
  @closed = @document_status = nil
  @tags, @stack = [], []
  @entities = []
end

Unescapes all possible entities

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 411
# Expands the entity and character references in +string+ and returns the
# result as a new String.
#
# string::   the text to expand; it is left unmodified.
# entities:: optional table of entity name => value, passed on to +entity+.
# filter::   optional collection of entity names that must be left unexpanded.
#
# Line endings ("\r\n" and "\r") are converted to "\n" first. Numeric
# character references (decimal and hexadecimal) are replaced by the
# corresponding UTF-8 character, then named references are resolved through
# +entity+ and DEFAULT_ENTITIES, and finally "&amp;" becomes "&".
def unnormalize( string, entities=nil, filter=nil )
  # dup (not clone) so that a frozen input still yields a mutable copy.
  rv = string.dup
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.size == 0
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
    code = $1
    # Integer() needs the "0x" prefix to read hexadecimal digits.
    code = "0#{code}" if code[0] == ?x
    [Integer(code)].pack('U*')
  }
  # Numeric references have no captured name; only named ones remain.
  names = matches.collect { |captures| captures[0] }.compact
  if names.size > 0
    names.each do |entity_reference|
      unless filter and filter.include?(entity_reference)
        entity_value = entity( entity_reference, entities )
        if entity_value
          re = /&#{entity_reference};/
          # Block form: the value is inserted literally, so backslashes in it
          # are not interpreted as replacement escapes.
          rv.gsub!( re ) { entity_value }
        end
      end
    end
    names.each do |entity_reference|
      unless filter and filter.include?(entity_reference)
        er = DEFAULT_ENTITIES[entity_reference]
        rv.gsub!( er[0], er[2] ) if er
      end
    end
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end

Push an event back on the head of the stream. This method has (theoretically) infinite depth.

[Source]

     # File lib/rexml/parsers/baseparser.rb, line 160
# Puts +token+ at the front of the event stack (@stack), ahead of any events
# already stored there.
def unshift token
  @stack.insert( 0, token )
end

[Validate]