| Class | REXML::Parsers::BaseParser |
| In: |
lib/rexml/parsers/baseparser.rb
|
| Parent: | Object |
This API is experimental, and subject to change.
    parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
    while parser.has_next?
      res = parser.next
      puts res[1]['att'] if res.start_tag? and res[0] == 'b'
    end
See the PullEvent class for information on the content of the results. The data is identical to the arguments passed for the various events to the StreamListener API.
Notice that:
    parser = PullParser.new( "<a>BAD DOCUMENT" )
    while parser.has_next?
      res = parser.next
      raise res[1] if res.error?
    end
Nat Price gave me some good ideas for the API.
| NCNAME_STR | = | '[\w:][\-\w\d.]*' | ||
| NAME_STR | = | "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" | ||
| NAMECHAR | = | '[\-\w\d\.:]' | ||
| NAME | = | "([\\w:]#{NAMECHAR}*)" | ||
| NMTOKEN | = | "(?:#{NAMECHAR})+" | ||
| NMTOKENS | = | "#{NMTOKEN}(\\s+#{NMTOKEN})*" | ||
| REFERENCE | = | "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" | ||
| REFERENCE_RE | = | /#{REFERENCE}/ | ||
| DOCTYPE_START | = | /\A\s*<!DOCTYPE\s/um | ||
| DOCTYPE_PATTERN | = | /\s*<!DOCTYPE\s+(.*?)(\[|>)/um | ||
| ATTRIBUTE_PATTERN | = | /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um | ||
| COMMENT_START | = | /\A<!--/u | ||
| COMMENT_PATTERN | = | /<!--(.*?)-->/um | ||
| CDATA_START | = | /\A<!\[CDATA\[/u | ||
| CDATA_END | = | /^\s*\]\s*>/um | ||
| CDATA_PATTERN | = | /<!\[CDATA\[(.*?)\]\]>/um | ||
| XMLDECL_START | = | /\A<\?xml\s/u; | ||
| XMLDECL_PATTERN | = | /<\?xml\s+(.*?)\?>/um | ||
| INSTRUCTION_START | = | /\A<\?/u | ||
| INSTRUCTION_PATTERN | = | /<\?(.*?)(\s+.*?)?\?>/um | ||
| TAG_MATCH | = | /^<((?>#{NAME_STR}))\s*((?>\s+#{NAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/um | ||
| CLOSE_MATCH | = | /^\s*<\/(#{NAME_STR})\s*>/um | ||
| VERSION | = | /\bversion\s*=\s*["'](.*?)['"]/um | ||
| ENCODING | = | /\bencoding=["'](.*?)['"]/um | ||
| STANDALONE | = | /\bstandalone=["'](.*?)['"]/um | ||
| ENTITY_START | = | /^\s*<!ENTITY/ | ||
| IDENTITY | = | /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'].*?['"])?(\s+['"].*?["'])?/u | ||
| ELEMENTDECL_START | = | /^\s*<!ELEMENT/um | ||
| ELEMENTDECL_PATTERN | = | /^\s*(<!ELEMENT.*?)>/um | ||
| SYSTEMENTITY | = | /^\s*(%.*?;)\s*$/um | ||
| ENUMERATION | = | "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" | ||
| NOTATIONTYPE | = | "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" | ||
| ENUMERATEDTYPE | = | "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" | ||
| ATTTYPE | = | "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})" | ||
| ATTVALUE | = | "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')" | ||
| DEFAULTDECL | = | "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" | ||
| ATTDEF | = | "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" | ||
| ATTDEF_RE | = | /#{ATTDEF}/ | ||
| ATTLISTDECL_START | = | /^\s*<!ATTLIST/um | ||
| ATTLISTDECL_PATTERN | = | /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um | ||
| NOTATIONDECL_START | = | /^\s*<!NOTATION/um | ||
| PUBLIC | = | /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um | ||
| SYSTEM | = | /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um | ||
| TEXT_PATTERN | = | /\A([^<]*)/um | ||
| PUBIDCHAR | = | "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" | Entity constants | |
| SYSTEMLITERAL | = | %Q{((?:"[^"]*")|(?:'[^']*'))} | ||
| PUBIDLITERAL | = | %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} | ||
| EXTERNALID | = | "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" | ||
| NDATADECL | = | "\\s+NDATA\\s+#{NAME}" | ||
| PEREFERENCE | = | "%#{NAME};" | ||
| ENTITYVALUE | = | %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} | ||
| PEDEF | = | "(?:#{ENTITYVALUE}|#{EXTERNALID})" | ||
| ENTITYDEF | = | "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" | ||
| PEDECL | = | "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" | ||
| GEDECL | = | "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" | ||
| ENTITYDECL | = | /\s*(?:#{GEDECL})|(?:#{PEDECL})/um | ||
| EREFERENCE | = | /&(?!#{NAME};)/ | ||
| DEFAULT_ENTITIES | = | { 'gt' => [/&gt;/, '&gt;', '>', />/], 'lt' => [/&lt;/, '&lt;', '<', /</], 'quot' => [/&quot;/, '&quot;', '"', /"/], "apos" => [/&apos;/, "&apos;", "'", /'/] } |
| source | [R] |
# File lib/rexml/parsers/baseparser.rb, line 99
# Builds a parser over +source+ by handing it to #stream=, which also
# resets all parser state.
def initialize(source)
  self.stream = source
end
# File lib/rexml/parsers/baseparser.rb, line 103
# Registers +listener+ so that it receives every event returned by #pull.
#
# On the first registration the receiver's #pull is aliased to
# +_old_pull+ and replaced by a wrapper that forwards each event to
# every registered listener (via +listener.receive+) before returning it.
# Returns the listener list.
def add_listener(listener)
  unless defined?(@listeners) && @listeners
    @listeners = []
    instance_eval "alias :_old_pull :pull\ndef pull\nevent = _old_pull\n@listeners.each do |listener|\nlistener.receive event\nend\nevent\nend\n"
  end
  @listeners << listener
end
Returns true if there are no more events
# File lib/rexml/parsers/baseparser.rb, line 142
# True when both the underlying source and the stack of already-parsed
# (peeked) events report that they are empty.
def empty?
  @source.empty? && @stack.empty?
end
# File lib/rexml/parsers/baseparser.rb, line 377
# Looks up the replacement text for the entity named +reference+.
#
# +entities+ (may be nil) is consulted first; when it yields nothing the
# third element of the matching DEFAULT_ENTITIES entry is used. The value
# found is passed through #unnormalize before being returned. Returns nil
# when the reference is unknown.
def entity(reference, entities)
  replacement = entities && entities[reference]
  unless replacement
    default = DEFAULT_ENTITIES[reference]
    replacement = default[2] if default
  end
  return nil unless replacement

  unnormalize(replacement, entities)
end
Escapes all possible entities
# File lib/rexml/parsers/baseparser.rb, line 388
# Escapes all possible entities in +input+ and returns the escaped copy;
# +input+ itself is not modified.
#
# input::         the text to escape
# entities::      optional collection yielding (name, value) pairs; every
#                 occurrence of a value is replaced by "&name;"
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted from +entities+
def normalize(input, entities = nil, entity_filter = nil)
  copy = input.clone
  # Doing it like this rather than in a loop improves the speed.
  # Escape every ampersand that does not already start a named reference.
  copy.gsub!(EREFERENCE, '&amp;')
  if entities
    entities.each do |key, value|
      # The filter lists entity names, so it is checked against the key.
      next if entity_filter && entity_filter.include?(key)
      copy.gsub!(value, "&#{key};")
    end
  end
  copy.gsub!(EREFERENCE, '&amp;')
  # Each DEFAULT_ENTITIES entry holds the raw-character pattern at index 3
  # and its escaped form at index 1.
  DEFAULT_ENTITIES.each do |_name, forms|
    copy.gsub!(forms[3], forms[1])
  end
  copy
end
Peek at the event at index +depth+ in the stack of pending events. The first element on the stack is at depth 0. If depth is -1, the parser will parse to the end of the input stream and return the last event, which is always :end_document. Be aware that this causes the stream to be parsed up to the requested depth, so you can effectively pre-parse the entire document (pull the entire thing into memory) using this method.
# File lib/rexml/parsers/baseparser.rb, line 165
# Returns the event at index +depth+ of the pending-event stack without
# removing it, pulling further events as needed and storing them on the
# stack.
#
# depth:: index to look at (default 0); -1 pulls events until #empty?
#         is true and returns the last stacked event.
#
# Raises a RuntimeError when +depth+ is smaller than -1.
def peek(depth = 0)
  raise %Q[Illegal argument "#{depth}"] if depth < -1

  pulled = []
  if depth == -1
    pulled.push(pull()) until empty?
  else
    wanted = depth + 1
    pulled.push(pull()) while @stack.size + pulled.size < wanted
  end
  @stack += pulled unless pulled.empty?
  @stack[depth]
end
# File lib/rexml/parsers/baseparser.rb, line 132
# Reports the source's current position when the source responds to
# +position+; otherwise returns 0.
def position
  # FIXME: sources that cannot report a position always yield 0
  @source.respond_to?(:position) ? @source.position : 0
end
Returns the next event. This is a PullEvent object.
# File lib/rexml/parsers/baseparser.rb, line 180
# Returns the next event as an Array whose first element is the event type.
#
# Event shapes produced here:
#   [ :end_document ]
#   [ :start_element, name, attributes_hash ]
#   [ :end_element, name ]
#   [ :text, text ]
#   [ :cdata, text ]
#   [ :comment, text ]
#   [ :processing_instruction, target, content ]
#   [ :xmldecl, version, encoding, standalone ]
#   [ :start_doctype, name, pub_sys, long_name, uri ]
#   [ :end_doctype ]
#   [ :externalentity, reference ]
#   [ :elementdecl, declaration ]
#   [ :entitydecl, name, ... ]
#   [ :attlistdecl, element, pairs, contents ]
#   [ :notationdecl, name, "PUBLIC"|"SYSTEM", public_id, system_id ]
#
# Events already queued on @stack (by #peek or by a DOCTYPE without an
# internal subset) are returned before anything new is parsed.
#
# Raises REXML::ParseException on malformed input; any other exception
# raised while parsing element content is wrapped in a ParseException.
def pull
  # A self-closing tag (<a/>) yields its :end_element on the following call.
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  return @stack.shift if @stack.size > 0
  @source.read if @source.buffer.size<2

  # --- Prolog: nothing but whitespace / declarations seen so far ---
  if @document_status == nil
    word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
    word = word[1] unless word.nil?
    case word
    when COMMENT_START
      return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
    when XMLDECL_START
      results = @source.match( XMLDECL_PATTERN, true )[1]
      version = VERSION.match( results )
      version = version[1] unless version.nil?
      encoding = ENCODING.match(results)
      encoding = encoding[1] unless encoding.nil?
      @source.encoding = encoding
      standalone = STANDALONE.match(results)
      standalone = standalone[1] unless standalone.nil?
      return [ :xmldecl, version, encoding, standalone ]
    when INSTRUCTION_START
      return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
    when DOCTYPE_START
      md = @source.match( DOCTYPE_PATTERN, true )
      identity = md[1]
      close = md[2]
      identity =~ IDENTITY
      name = $1
      raise REXML::ParseException.new( "DOCTYPE is missing a name", @source ) if name.nil?
      pub_sys = $2.nil? ? nil : $2.strip
      long_name = $3.nil? ? nil : $3.strip
      uri = $4.nil? ? nil : $4.strip
      args = [ :start_doctype, name, pub_sys, long_name, uri ]
      if close == ">"
        # No internal subset: queue the matching :end_doctype right away.
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        @source.match(/^\s*/um, true)
        @stack << [ :end_doctype ]
      else
        # DOCTYPE_PATTERN stopped at "[": declarations follow.
        @document_status = :in_doctype
      end
      return args
    when /^\s+/
      # Leading whitespace only: stay in the prolog state.
    else
      @document_status = :after_doctype
      @source.read if @source.buffer.size<2
      @source.match(/\s*/um, true)
    end
  end

  # --- Inside the DOCTYPE internal subset ---
  if @document_status == :in_doctype
    md = @source.match(/\s*(.*?>)/um)
    case md[1]
    when SYSTEMENTITY
      match = @source.match( SYSTEMENTITY, true )[1]
      return [ :externalentity, match ]

    when ELEMENTDECL_START
      return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

    when ENTITY_START
      match = @source.match( ENTITYDECL, true ).to_a.compact
      match[0] = :entitydecl
      ref = false
      if match[1] == '%'
        # Parameter entity: drop the marker now, re-append it at the end.
        ref = true
        match.delete_at 1
      end
      # Now we have to sort out what kind of entity reference this is
      if match[2] == 'SYSTEM'
        # External reference
        match[3] = match[3][1..-2] # strip the surrounding quotes
        match.delete_at(4) if match.size > 4 # Chop out NDATA decl
        # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
      elsif match[2] == 'PUBLIC'
        # External reference
        match[3] = match[3][1..-2] # PUBID, quotes stripped
        match[4] = match[4][1..-2] # HREF, quotes stripped
        # match is [ :entity, name, PUBLIC, pubid, href ]
      else
        match[2] = match[2][1..-2]
        match.pop if match.size == 4
        # match is [ :entity, name, value ]
      end
      match << '%' if ref
      return match
    when ATTLISTDECL_START
      md = @source.match( ATTLISTDECL_PATTERN, true )
      raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
      element = md[1]
      contents = md[0]

      # Collect attribute name => default value, skipping #IMPLIED ones.
      pairs = {}
      values = md[0].scan( ATTDEF_RE )
      values.each do |attdef|
        unless attdef[3] == "#IMPLIED"
          attdef.compact!
          val = attdef[3]
          val = attdef[4] if val == "#FIXED "
          pairs[attdef[0]] = val
        end
      end
      return [ :attlistdecl, element, pairs, contents ]
    when NOTATIONDECL_START
      md = nil
      if @source.match( PUBLIC )
        md = @source.match( PUBLIC, true )
        vals = [md[1],md[2],md[4],md[6]]
      elsif @source.match( SYSTEM )
        md = @source.match( SYSTEM, true )
        vals = [md[1],md[2],nil,md[4]]
      else
        raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
      end
      return [ :notationdecl, *vals ]
    when CDATA_END
      # "]>" closes the internal subset.
      @document_status = :after_doctype
      @source.match( CDATA_END, true )
      return [ :end_doctype ]
    end
  end

  # --- Element content ---
  begin
    if @source.buffer[0] == ?<
      if @source.buffer[1] == ?/
        # Closing tag: must match the most recently opened element.
        last_tag = @tags.pop
        md = @source.match( CLOSE_MATCH, true )
        raise REXML::ParseException.new( "Missing end tag for "+
          "'#{last_tag}' (got \"#{md[1]}\")",
          @source) unless last_tag == md[1]
        return [ :end_element, last_tag ]
      elsif @source.buffer[1] == ?!
        md = @source.match(/\A(\s*[^>]*>)/um)
        raise REXML::ParseException.new("Malformed node", @source) unless md
        if md[0][2] == ?-
          md = @source.match( COMMENT_PATTERN, true )
          return [ :comment, md[1] ] if md
        else
          md = @source.match( CDATA_PATTERN, true )
          return [ :cdata, md[1] ] if md
        end
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.buffer[1] == ??
        md = @source.match( INSTRUCTION_PATTERN, true )
        return [ :processing_instruction, md[1], md[2] ] if md
        raise REXML::ParseException.new( "Bad instruction declaration",
          @source)
      else
        # Get the next tag
        md = @source.match(TAG_MATCH, true)
        raise REXML::ParseException.new("malformed XML: missing tag start", @source) unless md
        attrs = []
        if md[2].size > 0
          attrs = md[2].scan( ATTRIBUTE_PATTERN )
          raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
        end

        if md[4]
          # Self-closing tag: remember it so the next call emits :end_element.
          @closed = md[1]
        else
          @tags.push( md[1] )
        end
        attributes = {}
        attrs.each { |a,b,c| attributes[a] = c }
        return [ :start_element, md[1], attributes ]
      end
    else
      md = @source.match( TEXT_PATTERN, true )
      if md[0].length == 0
        # Nothing was consumed as text; consume a run of whitespace instead.
        @source.match( /(\s+)/, true )
      end
      return [ :text, md[1] ]
    end
  rescue REXML::ParseException
    raise
  rescue Exception => error
    # NOTE(review): this wraps every Exception subclass, not only
    # StandardError -- confirm that is intended before narrowing it.
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, error )
  end
  return [ :dummy ]
end
# File lib/rexml/parsers/baseparser.rb, line 123
# Points the parser at a new +source+ (wrapped via
# SourceFactory.create_from) and resets all parser state: the pending
# self-closing tag, the document status, and the tag, event and entity
# lists.
def stream=(source)
  @source = SourceFactory.create_from(source)
  @closed = @document_status = nil
  @tags, @stack, @entities = [], [], []
end
Unescapes all possible entities
# File lib/rexml/parsers/baseparser.rb, line 404
# Unescapes all possible entities in +string+ and returns the result;
# +string+ itself is not modified.
#
# string::   the text to unescape
# entities:: optional lookup (name => value) passed on to #entity
# filter::   optional collection of entity names that must be left escaped
#
# Line endings are normalised first ("\r\n" and "\r" become "\n").
def unnormalize(string, entities = nil, filter = nil)
  rv = string.clone
  rv.gsub!(/\r\n?/, "\n")
  matches = rv.scan(REFERENCE_RE)
  return rv if matches.size == 0

  # Numeric character references: &#65; (decimal) and &#x41; (hex).
  # Leading zeros are skipped so Integer() never sees an octal-looking
  # string.
  rv.gsub!(/&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/) do
    code = $1
    code = "0#{code}" if code[0] == ?x
    [Integer(code)].pack('U*')
  end

  # REFERENCE_RE captures the name of named references only; numeric
  # references leave a nil capture, which is discarded here.
  names = matches.collect { |captures| captures[0] }.compact
  if names.size > 0
    names.each do |entity_reference|
      next if filter and filter.include?(entity_reference)
      entity_value = entity(entity_reference, entities)
      next unless entity_value
      # Literal pattern plus block replacement: regexp metacharacters in
      # the name (e.g. ".") and backslashes in the value are taken verbatim.
      rv.gsub!("&#{entity_reference};") { entity_value }
    end
    names.each do |entity_reference|
      next if filter and filter.include?(entity_reference)
      er = DEFAULT_ENTITIES[entity_reference]
      rv.gsub!(er[0], er[2]) if er
    end
    # Ampersands go last so "&amp;lt;" becomes "&lt;" rather than "<".
    rv.gsub!(/&amp;/, '&')
  end
  rv
end