-- Fandom Developers Wiki
-- Advertisement
-- Documentation icon Module documentation
--
-- The documentation for this module is missing. Click here to create it.

-- https://raw.githubusercontent.com/manoelcampos/LuaXML/master/LuaXML/xml.lua
-- https://github.com/manoelcampos/LuaXML/blob/master/LuaXML/COPYRIGHT
---
--  Overview:
--  =========
--
--      This module provides a non-validating XML stream parser in Lua. 
--  
--  Features:
--  =========
--  
--      * Tokenises well-formed XML (relatively robustly)
--      * Flexible handler based event API (see below)
--      * Parses all XML Infoset elements - ie.
--          - Tags
--          - Text
--          - Comments
--          - CDATA
--          - XML Decl
--          - Processing Instructions
--          - DOCTYPE declarations
--      * Provides limited well-formedness checking 
--        (checks for basic syntax & balanced tags only)
--      * Flexible whitespace handling (selectable)
--      * Entity Handling (selectable)
--  
--  Limitations:
--  ============
--  
--      * Non-validating
--      * No charset handling 
--      * No namespace support 
--      * Shallow well-formedness checking only (fails
--        to detect most semantic errors)
--  
--  API:
--  ====
--
--  The parser provides a partially object-oriented API with 
--  functionality split into tokeniser and handler components.
--  
--  The handler instance is passed to the tokeniser and receives
--  callbacks for each XML element processed (if a suitable handler
--  function is defined). The API is conceptually similar to the 
--  SAX API but implemented differently.
--  
--  The following events are generated by the tokeniser
--  
--      handler:starttag    - Start Tag
--      handler:endtag      - End Tag
--      handler:text        - Text
--      handler:decl        - XML Declaration
--      handler:pi          - Processing Instruction
--      handler:comment     - Comment
--      handler:dtd         - DOCTYPE definition
--      handler:cdata       - CDATA 
--  
--  The function prototype for all the callback functions is
--      
--      callback(val,attrs,start,end)
--  
--  where attrs is a table and val/attrs are overloaded for 
--  specific callbacks - ie.
--  
--      Callback     val            attrs (table)
--      --------     ---            -------------
--      starttag     name           { attributes (name=val).. }
--      endtag       name           nil
--      text         <text>         nil
--      cdata        <text>         nil
--      decl         "xml"          { attributes (name=val).. }
--      pi           pi name        { attributes (if present)..
--                                    _text = <PI Text>
--                                  }
--      comment      <text>         nil     
--      dtd          root element   { _root = <Root Element>,
--                                    _type = SYSTEM|PUBLIC,
--                                    _name = <name>,
--                                    _uri = <uri>,
--                                    _internal = <internal dtd>
--                                  }
--
--  (start & end provide the character positions of the start/end
--  of the element)
--
--  XML data is passed to the parser instance through the 'parse'
--  method (Note: must be passed a single string currently)
--
--  Options
--  =======
--
--  Parser options are controlled through the 'self.options' table.
--  Available options are -
--  
--      * stripWS   
--      
--        Strip non-significant whitespace (leading/trailing) 
--        and do not generate events for empty text elements
--  
--      * expandEntities 
--  
--        Expand entities (standard entities + single char 
--        numeric entities only currently - could be extended 
--        at runtime if a suitable DTD parser added elements
--        to the table, see obj._ENTITIES). May also be possible
--        to expand multibyte entities for UTF-8 only
--  
--      * errorHandler
--  
--        Custom error handler function 
--
--  NOTE: To disable a boolean option set it to 'nil' (or false), not '0'
--  (0 is a true value in Lua)
--  
--  Usage
--  =====
--
--  Create a handler instance - 
--  
--      h = { starttag = function(self,t,a,s,e) .... end,
--            endtag = function(self,t,a,s,e) .... end,
--            text = function(self,t,a,s,e) .... end }
--      h.cdata = h.text
--  
--  (or use predefined handler - see handler.lua)
--  
--  Create parser instance - 
--  
--      p = xmlParser(h)
--  
--  Set options -
--
--      p.options.xxxx = nil
--
--  Parse XML data -
--  
--      p:parse("<?xml... ")
--
--  License:
--  ========
--
--      This code is freely distributable under the terms of the Lua license
--      (http://www.lua.org/copyright.html)
--
--  History
--  =======
--  Added parameter parseAttributes (boolean) in xmlParser.parse method
--  If true (default value), tag attributes are parsed. 
--  by Manoel Campos da Silva Filho
--  http://manoelcampos.com
--  http://about.me/manoelcampos

--
--  $Id: xml.lua,v 1.1.1.1 2001/11/28 06:11:33 paulc Exp $
--
--  $Log: xml.lua,v $
--  Revision 1.1.1.1  2001/11/28 06:11:33  paulc
--  Initial Import
--
--@author Paul Chakravarti (paulc@passtheaardvark.com)<p/>


local p = {}

---Creates a non-validating XML stream parser bound to a handler.
--
--The returned object exposes an `options` table (stripWS, expandEntities,
--errorHandler) and a `parse` method. Handler callbacks are invoked with
--method syntax, so each one receives the handler itself as first argument:
--`callback(self, val, attrs, startpos, endpos)`.
--
--@param handler Handler table whose optional fields `starttag`, `endtag`,
--`text`, `decl`, `pi`, `comment`, `dtd` and `cdata` are called as the
--corresponding XML constructs are tokenised (see handler.lua). When
--omitted, an empty handler is used and no events are delivered.
--@return The parser object.
p.xmlParser = function(handler)
    local obj = {}

    -- Public attributes

    obj.options = {
        -- Strip leading/trailing whitespace from text and comments
        stripWS = 1,
        -- Expand the entities listed in obj._ENTITIES
        expandEntities = 1,
        -- Default handler raises a Lua error; a custom handler may return
        -- instead, in which case parsing continues where that is possible
        errorHandler = function(err, pos)
            error(string.format("%s [char=%d]\n",
                                err or "Parse Error", pos))
        end,
    }

    -- Public methods

    ---Tokenises a complete XML document and dispatches handler events.
    --@param str String holding the whole document.
    --@param parseAttributes Stored on the handler as
    --`handler.parseAttributes`; defaults to true when nil.
    obj.parse = function(self, str, parseAttributes)
        if parseAttributes == nil then
            parseAttributes = true
        end
        local h = self._handler
        h.parseAttributes = parseAttributes

        -- Start every document with an empty stack so that a previous
        -- failed parse cannot leak open tags into this one
        self._stack = {}

        local pos = 1
        while true do
            -- Get next tag (first pass - exceptions are fixed up below)
            local match, endmatch, text, endt1, tagstr, endt2 =
                string.find(str, self._XML, pos)
            if not match then
                if string.find(str, self._WS, pos) then
                    -- No more text - check document complete
                    if #self._stack ~= 0 then
                        self:_err(self._errstr.incompleteXmlErr, pos)
                    end
                else
                    -- Unparsable text
                    self:_err(self._errstr.xmlErr, pos)
                end
                -- Nothing left to tokenise, even if the error handler
                -- returned instead of raising
                return
            end

            -- Handle leading text; `match` then moves on to the '<'
            local starttext = match
            local endtext = match + string.len(text) - 1
            match = endtext + 1
            text = self:_parseEntities(self:_stripWS(text))
            if text ~= "" and h.text then
                h:text(text, nil, starttext, endtext)
            end

            -- Test for tag type
            if string.find(string.sub(tagstr, 1, 5), "^%?xml%s") then
                -- XML Declaration
                match, endmatch, text = string.find(str, self._PI, pos)
                if not match then
                    self:_err(self._errstr.declErr, pos)
                    return
                end
                if match ~= 1 then
                    -- Must be at start of doc if present
                    self:_err(self._errstr.declStartErr, pos)
                end
                local tagname, attrs = self:_parseTag(text)
                -- TODO: Check attributes are valid
                -- Check for version (mandatory); attrs is nil when the
                -- declaration carries no attributes at all
                if not attrs or attrs.version == nil then
                    self:_err(self._errstr.declAttrErr, pos)
                end
                if h.decl then
                    h:decl(tagname, attrs, match, endmatch)
                end
            elseif string.sub(tagstr, 1, 1) == "?" then
                -- Processing Instruction
                match, endmatch, text = string.find(str, self._PI, pos)
                if not match then
                    self:_err(self._errstr.piErr, pos)
                    return
                end
                if h.pi then
                    -- Parse PI attributes & text
                    local tagname, attrs = self:_parseTag(text)
                    local pi = string.sub(text, string.len(tagname) + 1)
                    if pi ~= "" then
                        attrs = attrs or {}
                        attrs._text = pi
                    end
                    h:pi(tagname, attrs, match, endmatch)
                end
            elseif string.sub(tagstr, 1, 3) == "!--" then
                -- Comment
                match, endmatch, text = string.find(str, self._COMMENT, pos)
                if not match then
                    self:_err(self._errstr.commentErr, pos)
                    return
                end
                if h.comment then
                    text = self:_parseEntities(self:_stripWS(text))
                    h:comment(text, nil, match, endmatch)
                end
            elseif string.sub(tagstr, 1, 8) == "!DOCTYPE" then
                -- DTD
                local attrs
                match, endmatch, attrs = self:_parseDTD(str, pos)
                if not match then
                    self:_err(self._errstr.dtdErr, pos)
                    return
                end
                if h.dtd then
                    h:dtd(attrs._root, attrs, match, endmatch)
                end
            elseif string.sub(tagstr, 1, 8) == "![CDATA[" then
                -- CDATA
                match, endmatch, text = string.find(str, self._CDATA, pos)
                if not match then
                    self:_err(self._errstr.cdataErr, pos)
                    return
                end
                if h.cdata then
                    h:cdata(text, nil, match, endmatch)
                end
            else
                -- Normal tag

                -- The first-pass match stops at the first '>'. If that
                -- leaves an unterminated quoted attribute value, the '>'
                -- was inside the value (eg. <tag attr="123>456">), so
                -- keep extending the tag to the next '>' until the
                -- quotes balance.
                while string.find(tagstr, self._ATTRERR1)
                      or string.find(tagstr, self._ATTRERR2) do
                    local extstart, extend, extslash =
                        string.find(str, self._TAGEXT, endmatch + 1)
                    if not extstart then
                        self:_err(self._errstr.xmlErr, pos)
                        return
                    end
                    -- Re-attach the '/' that was tentatively read as a
                    -- self-closing marker, then the '>' and what follows
                    tagstr = tagstr .. endt2
                             .. string.sub(str, endmatch, extstart - 1)
                    endt2 = extslash
                    endmatch = extend
                end

                -- Extract tagname/attrs
                local tagname, attrs = self:_parseTag(tagstr)

                if endt1 == "/" then
                    -- End tag: validation and stack maintenance happen
                    -- whether or not the handler wants the event
                    if attrs then
                        -- Shouldnt have any attributes in endtag
                        self:_err(string.format("%s (/%s)",
                                                self._errstr.endTagErr,
                                                tagname),
                                  pos)
                    end
                    if table.remove(self._stack) ~= tagname then
                        self:_err(string.format("%s (/%s)",
                                                self._errstr.unmatchedTagErr,
                                                tagname),
                                  pos)
                    end
                    if h.endtag then
                        h:endtag(tagname, nil, match, endmatch)
                    end
                else
                    -- Start Tag
                    table.insert(self._stack, tagname)
                    if h.starttag then
                        h:starttag(tagname, attrs, match, endmatch)
                    end
                    -- TODO: Self-closing tags are being returned as a
                    -- table, which makes them awkward for the NCLua app
                    -- to handle. An empty string field should be
                    -- returned instead.
                    -- Self-Closing Tag
                    if endt2 == "/" then
                        table.remove(self._stack)
                        if h.endtag then
                            h:endtag(tagname, nil, match, endmatch)
                        end
                    end
                end
            end
            pos = endmatch + 1
        end
    end

    -- Private attributes/functions

    obj._handler    = handler or {}
    obj._stack      = {}

    obj._XML        = '^([^<]*)<(%/?)([^>]-)(%/?)>'
    -- '-' is escaped inside the sets: mixing a class such as %w with a
    -- range is undefined in Lua patterns. '.' is a legal name character.
    obj._ATTR1      = '([%w%-%.:_]+)%s*=%s*"(.-)"'
    obj._ATTR2      = '([%w%-%.:_]+)%s*=%s*\'(.-)\''
    obj._CDATA      = '<%!%[CDATA%[(.-)%]%]>'
    obj._PI         = '<%?(.-)%?>'
    obj._COMMENT    = '<!%-%-(.-)%-%->'
    obj._TAG        = '^(.-)%s.*'
    obj._LEADINGWS  = '^%s+'
    obj._TRAILINGWS = '%s+$'
    obj._WS         = '^%s*$'
    obj._DTD1       = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*(%b[])%s*>'
    obj._DTD2       = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*(%b[])%s*>'
    obj._DTD3       = '<!DOCTYPE%s+(.-)%s*(%b[])%s*>'
    obj._DTD4       = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*>'
    obj._DTD5       = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*>'

    -- Match a tag string that ends inside an unterminated quoted value
    obj._ATTRERR1   = '=%s*"[^"]*$'
    obj._ATTRERR2   = '=%s*\'[^\']*$'
    obj._TAGEXT     = '(%/?)>'

    -- Keys are Lua patterns for one whole entity reference; values are
    -- the replacement (string, or function receiving the captures)
    obj._ENTITIES = { ["&lt;"] = "<",
                      ["&gt;"] = ">",
                      ["&amp;"] = "&",
                      ["&quot;"] = '"',
                      ["&apos;"] = "'",
                      ["&#(%d+);"] = function (x)
                                        local d = tonumber(x)
                                        if d >= 0 and d < 256 then
                                            return string.char(d)
                                        else
                                            -- Not a single byte: leave
                                            -- the reference as written
                                            return "&#"..x..";"
                                        end
                                     end,
                      ["&#x(%x+);"] = function (x)
                                        local d = tonumber(x,16)
                                        if d >= 0 and d < 256 then
                                            return string.char(d)
                                        else
                                            return "&#x"..x..";"
                                        end
                                      end,
                    }

    ---Reports an error through options.errorHandler (if one is set).
    --@param err Error message.
    --@param pos Character position associated with the error.
    obj._err = function(self, err, pos)
        if self.options.errorHandler then
            self.options.errorHandler(err, pos)
        end
    end

    obj._errstr = { xmlErr = "Error Parsing XML",
                    declErr = "Error Parsing XMLDecl",
                    declStartErr = "XMLDecl not at start of document",
                    declAttrErr = "Invalid XMLDecl attributes",
                    piErr = "Error Parsing Processing Instruction",
                    commentErr = "Error Parsing Comment",
                    cdataErr = "Error Parsing CDATA",
                    dtdErr = "Error Parsing DTD",
                    endTagErr = "End Tag Attributes Invalid",
                    unmatchedTagErr = "Unbalanced Tag",
                    incompleteXmlErr = "Incomplete XML Document",
                  }

    ---Removes leading and trailing whitespace when options.stripWS is set.
    --@param s String to trim.
    --@return The (possibly trimmed) string.
    obj._stripWS = function(self, s)
        if self.options.stripWS then
            s = string.gsub(s, self._LEADINGWS, '')
            s = string.gsub(s, self._TRAILINGWS, '')
        end
        return s
    end

    ---Expands entity references when options.expandEntities is set.
    --Each reference in the input is looked at exactly once, so the result
    --of one expansion is never expanded again ("&amp;lt;" gives "&lt;")
    --and the outcome does not depend on table iteration order.
    --@param s String possibly containing entity references.
    --@return The string with every known reference replaced.
    obj._parseEntities = function(self, s)
        if not self.options.expandEntities then
            return s
        end
        local entities = self._ENTITIES
        local expanded = string.gsub(s, "&[^&;%s]+;", function(ref)
            local direct = entities[ref]
            if type(direct) == "string" then
                return direct
            end
            -- Fall back to the pattern-style keys (numeric references)
            for pattern, replacement in pairs(entities) do
                local result, count =
                    string.gsub(ref, "^" .. pattern .. "$", replacement)
                if count > 0 then
                    return result
                end
            end
            -- Unknown reference: returning nil keeps the original text
            return nil
        end)
        return expanded
    end

    ---Matches a DOCTYPE declaration at or after a position.
    --The five _DTD patterns are tried in order (SYSTEM/PUBLIC with an
    --internal subset, internal subset only, SYSTEM/PUBLIC without one).
    --@param s Document string.
    --@param pos Position to start searching from.
    --@return Start position, end position and a table with the fields
    --_root, _type, _name, _uri and _internal that the matching form
    --provides; or nil when no form matches.
    obj._parseDTD = function(self, s, pos)
        -- match,endmatch,root,type,name,uri,internal
        local m, e, r, t, n, u, i
        m, e, r, t, u, i = string.find(s, self._DTD1, pos)
        if m then
            return m, e, {_root=r, _type=t, _uri=u, _internal=i}
        end
        m, e, r, t, n, u, i = string.find(s, self._DTD2, pos)
        if m then
            return m, e, {_root=r, _type=t, _name=n, _uri=u, _internal=i}
        end
        m, e, r, i = string.find(s, self._DTD3, pos)
        if m then
            return m, e, {_root=r, _internal=i}
        end
        m, e, r, t, u = string.find(s, self._DTD4, pos)
        if m then
            return m, e, {_root=r, _type=t, _uri=u}
        end
        m, e, r, t, n, u = string.find(s, self._DTD5, pos)
        if m then
            return m, e, {_root=r, _type=t, _name=n, _uri=u}
        end
        return nil
    end

    ---Parses a string representing a tag
    --@param s String containing the tag text (without the angle brackets)
    --@return The tag name, and a table of attributes keyed by lower-cased
    --attribute name with entity-expanded values, or nil when the tag has
    --no attributes
    obj._parseTag = function(self, s)
        local attrs = {}
        -- Tracked separately from the table so that an attribute that is
        -- itself named "_" is not lost
        local found = false
        local tagname = string.gsub(s, self._TAG, '%1')
        local function collect(k, v)
            attrs[string.lower(k)] = self:_parseEntities(v)
            found = true
        end
        string.gsub(s, self._ATTR1, collect)
        string.gsub(s, self._ATTR2, collect)
        if not found then
            attrs = nil
        end
        return tagname, attrs
    end

    return obj

end

return p
-- Advertisement