Using the Pull Parser
This API is experimental, and subject to change.
# Walk every event of a small document and print the 'att' attribute of
# each start tag named 'b'.
parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
while parser.has_next?
res = parser.next
# res[0] is compared against the tag name; res[1] is indexed by attribute name.
puts res[1]['att'] if res.start_tag? and res[0] == 'b'
end
See the PullEvent class for information on the content of the results. The data is identical to the arguments passed for the various events to the StreamListener API.
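Notice that errors are delivered as events, so check each result with error? and decide yourself whether to raise: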
# The <a> element is never closed, so this document is malformed.
# NOTE(review): the loop implies the failure arrives as an event
# (res.error?) rather than as an exception from parser.next -- confirm.
parser = PullParser.new( "<a>BAD DOCUMENT" )
while parser.has_next?
res = parser.next
# Re-raise whatever the error event carries in res[1].
raise res[1] if res.error?
end
Nat Price gave me some good ideas for the API.
- A
- E
- H
- N
- P
- S
- U
| LETTER | = | '[:alpha:]' |
| DIGIT | = | '[:digit:]' |
| COMBININGCHAR | = | '' |
| EXTENDER | = | '' |
| NCNAME_STR | = | "[#{LETTER}_:][-[:alnum:]._:#{COMBININGCHAR}#{EXTENDER}]*" |
| NAME_STR | = | "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})" |
| UNAME_STR | = | "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" |
| NAMECHAR | = | '[\-\w\.:]' |
| NAME | = | "([\\w:]#{NAMECHAR}*)" |
| NMTOKEN | = | "(?:#{NAMECHAR})+" |
| NMTOKENS | = | "#{NMTOKEN}(\\s+#{NMTOKEN})*" |
| REFERENCE | = | "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)" |
| REFERENCE_RE | = | /#{REFERENCE}/ |
| DOCTYPE_START | = | /\A\s*<!DOCTYPE\s/um |
| DOCTYPE_END | = | /\A\s*\]\s*>/um |
| DOCTYPE_PATTERN | = | /\s*<!DOCTYPE\s+(.*?)(\[|>)/um |
| ATTRIBUTE_PATTERN | = | /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um |
| COMMENT_START | = | /\A<!--/u |
| COMMENT_PATTERN | = | /<!--(.*?)-->/um |
| CDATA_START | = | /\A<!\[CDATA\[/u |
| CDATA_END | = | /\A\s*\]\s*>/um |
| CDATA_PATTERN | = | /<!\[CDATA\[(.*?)\]\]>/um |
| XMLDECL_START | = | /\A<\?xml\s/u; |
| XMLDECL_PATTERN | = | /<\?xml\s+(.*?)\?>/um |
| INSTRUCTION_START | = | /\A<\?/u |
| INSTRUCTION_PATTERN | = | /<\?(.*?)(\s+.*?)?\?>/um |
| TAG_MATCH | = | /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um |
| CLOSE_MATCH | = | /^\s*<\/(#{NAME_STR})\s*>/um |
| VERSION | = | /\bversion\s*=\s*["'](.*?)['"]/um |
| ENCODING | = | /\bencoding\s*=\s*["'](.*?)['"]/um |
| STANDALONE | = | /\bstandalone\s*=\s*["'](.*?)['"]/um |
| ENTITY_START | = | /\A\s*<!ENTITY/ |
| IDENTITY | = | /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u |
| ELEMENTDECL_START | = | /\A\s*<!ELEMENT/um |
| ELEMENTDECL_PATTERN | = | /\A\s*(<!ELEMENT.*?)>/um |
| SYSTEMENTITY | = | /\A\s*(%.*?;)\s*$/um |
| ENUMERATION | = | "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" |
| NOTATIONTYPE | = | "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" |
| ENUMERATEDTYPE | = | "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" |
| ATTTYPE | = | "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})" |
| ATTVALUE | = | "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')" |
| DEFAULTDECL | = | "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" |
| ATTDEF | = | "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" |
| ATTDEF_RE | = | /#{ATTDEF}/ |
| ATTLISTDECL_START | = | /\A\s*<!ATTLIST/um |
| ATTLISTDECL_PATTERN | = | /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um |
| NOTATIONDECL_START | = | /\A\s*<!NOTATION/um |
| PUBLIC | = | /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um |
| SYSTEM | = | /\A\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um |
| TEXT_PATTERN | = | /\A([^<]*)/um |
| PUBIDCHAR | = | "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" |
Entity constants |
||
| SYSTEMLITERAL | = | %Q{((?:"[^"]*")|(?:'[^']*'))} |
| PUBIDLITERAL | = | %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} |
| EXTERNALID | = | "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" |
| NDATADECL | = | "\\s+NDATA\\s+#{NAME}" |
| PEREFERENCE | = | "%#{NAME};" |
| ENTITYVALUE | = | %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} |
| PEDEF | = | "(?:#{ENTITYVALUE}|#{EXTERNALID})" |
| ENTITYDEF | = | "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" |
| PEDECL | = | "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" |
| GEDECL | = | "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" |
| ENTITYDECL | = | /\s*(?:#{GEDECL})|(?:#{PEDECL})/um |
| EREFERENCE | = | /&(?!#{NAME};)/ |
| DEFAULT_ENTITIES | = | { 'gt' => [/&gt;/, '&gt;', '>', />/], 'lt' => [/&lt;/, '&lt;', '<', /</], 'quot' => [/&quot;/, '&quot;', '"', /"/], "apos" => [/&apos;/, '&apos;', "'", /'/] } |
| MISSING_ATTRIBUTE_QUOTES | = | /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um |
These are patterns to identify common markup errors, to make the error messages more informative. |
||
| [R] | source |
Source: show
# File lib/rexml/parsers/baseparser.rb, line 116
# Builds a parser that reads from +source+ (handed to #stream=) and
# starts out with no registered listeners.
def initialize(source)
  self.stream = source
  @listeners = []
end
Source: show
# File lib/rexml/parsers/baseparser.rb, line 121
# Appends +listener+ to the list of registered listeners and returns
# that list.
def add_listener(listener)
  @listeners.push(listener)
end
Returns true if there are no more events
Source: show
# File lib/rexml/parsers/baseparser.rb, line 147
# True only when the source has nothing left to read and no events are
# queued on the stack.
def empty?
  @source.empty? && @stack.empty?
end
Source: show
# File lib/rexml/parsers/baseparser.rb, line 448
# Resolves the entity called +reference+.
#
# +entities+ (may be nil) is consulted first; when it yields nothing the
# third slot of the matching DEFAULT_ENTITIES row is used. The resolved
# text is passed through #unnormalize before being returned.
#
# Returns nil when the reference cannot be resolved.
def entity(reference, entities)
  resolved = entities ? entities[reference] : nil
  unless resolved
    builtin = DEFAULT_ENTITIES[reference]
    resolved = builtin[2] if builtin
  end
  return nil unless resolved

  unnormalize(resolved, entities)
end
Returns true if there are more events. Synonymous with !empty?
Source: show
# File lib/rexml/parsers/baseparser.rb, line 152
# True while the source still has input or events remain queued on the
# stack.
def has_next?
  !@source.empty? || !@stack.empty?
end
Escapes all possible entities
Source: show
# File lib/rexml/parsers/baseparser.rb, line 459
# Escapes all possible entities.
#
# input::         String to escape. It is duplicated, never modified, so
#                 frozen strings are accepted.
# entities::      optional name/value pairs; every occurrence of a value
#                 is rewritten as "&name;".
# entity_filter:: optional collection of entity names that must not be
#                 substituted.
#
# Returns the escaped copy of +input+.
def normalize( input, entities=nil, entity_filter=nil )
  # dup (not clone) so the working copy is never frozen.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  # Escape every ampersand that does not already start a reference.
  copy.gsub!( EREFERENCE, '&amp;' )
  if entities
    entities.each do |key, value|
      # The filter lists entity names, so test the name of this pair.
      next if entity_filter and entity_filter.include?( key )
      copy.gsub!( value, "&#{key};" )
    end
  end
  copy.gsub!( EREFERENCE, '&amp;' )
  # Slot 3 matches the raw character, slot 1 is its escaped spelling.
  DEFAULT_ENTITIES.each_value do |row|
    copy.gsub!( row[3], row[1] )
  end
  copy
end
Peek at the event at index depth in the stack. The first element on
the stack is at depth 0. If depth is -1, will parse to the
end of the input stream and return the last event, which is always
:end_document. Be aware that this causes the stream to be parsed up to the
event at that depth, so you can effectively pre-parse the entire
document (pull the entire thing into memory) using this method.
Source: show
# File lib/rexml/parsers/baseparser.rb, line 168
# Returns the event at index +depth+ of the stack without consuming it,
# pulling further events as needed. A depth of -1 pulls until #empty?
# and returns the last stacked event.
#
# Raises RuntimeError when +depth+ is below -1.
def peek(depth = 0)
  raise %Q[Illegal argument "#{depth}"] if depth < -1

  pulled = []
  if depth == -1
    pulled << pull until empty?
  else
    # Re-read @stack.size on every pass, exactly as the condition demands.
    pulled << pull while @stack.size + pulled.size < depth + 1
  end
  @stack += pulled unless pulled.empty?
  @stack[depth]
end
Source: show
# File lib/rexml/parsers/baseparser.rb, line 137
# Reports the position of the underlying source when that source
# responds to #position; otherwise returns 0.
def position
  return @source.position if @source.respond_to?(:position)

  # FIXME
  0
end
Returns the next event. This is a PullEvent object.
Source: show
# File lib/rexml/parsers/baseparser.rb, line 183
# Fetches the next event from #pull_event, passes it to the #receive
# method of every registered listener, then returns it.
def pull
  event = pull_event
  @listeners.each { |listener| listener.receive(event) }
  event
end
Source: show
# File lib/rexml/parsers/baseparser.rb, line 127
# Points the parser at a new +source+ (wrapped by
# SourceFactory.create_from) and resets all per-document parsing state.
def stream=(source)
  @source = SourceFactory.create_from(source)
  @closed = @document_status = nil
  @tags, @stack, @entities = [], [], []
  @nsstack = []
end
Unescapes all possible entities
Source: show
# File lib/rexml/parsers/baseparser.rb, line 475
# Unescapes all possible entities.
#
# string::   text to unescape. It is duplicated, never modified, so
#            frozen strings are accepted.
# entities:: optional lookup of entity name => value, passed on to #entity.
# filter::   optional collection of entity names to leave escaped.
#
# Line endings ("\r\n" and lone "\r") are also converted to "\n".
# Returns the unescaped copy of +string+.
def unnormalize( string, entities=nil, filter=nil )
  # dup (not clone) so the working copy is never frozen.
  rv = string.dup
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.size == 0
  # Numeric character references, decimal (&#65;) or hex (&#x41;).
  # Leading zeros are consumed by 0* before the digits are captured.
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
    m = $1
    m = "0#{m}" if m[0] == 'x'   # "x41" -> "0x41" so Integer() reads hex
    [Integer(m)].pack('U*')
  }
  # Keep only the captured entity names; numeric references capture nil.
  names = matches.collect { |captures| captures[0] }.compact
  if names.size > 0
    names.each do |entity_reference|
      next if filter and filter.include?( entity_reference )
      entity_value = entity( entity_reference, entities )
      if entity_value
        re = /&#{entity_reference};/
        rv.gsub!( re, entity_value )
      else
        er = DEFAULT_ENTITIES[entity_reference]
        rv.gsub!( er[0], er[2] ) if er
      end
    end
    # Ampersands go last so text such as "&amp;lt;" ends up as "&lt;".
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end