ABS383D            ; Read the a presented XML Query and tokenize it to a tree [ 12/10/1999  5:17 PM ]

            ; For CECS 383: The Science and Technology of the World Wide Web by Arthur B. Smith (754104)

       

            ; This is NOT a formal lexical analyzer for XML.  It makes many assumptions.

            ; As a result, this will accept "XML" files that are not valid (or even well-formed)

            ; And will not accept XML files that don't conform to the general structure of queries

            ; In particular, only <Expr> tags have content, so content is ignored except when in an <Expr>

            ; Another implicit assumption is that the Entire Query file is no more than 16K big.

            ; That should not be a problem in any realistic query.

       

ReadQry(FName)       ; Read the Query from the file named FName and generate a

            ; tokenized form in ^ABS383("QUERY",$JOB,"TOKENS").  If an error occurs, a "ERROR" node is

            ; inserted in ^ABS383("QUERY",$JOB,"TOKENS") at that point.

        Set NumPatrn=".1""-""1.N.1(1"".""1.N).1(1""E"".1""-""1.N)"  ; Pattern for a legal number

        Write "Reading..."

        Open 51:(FName:"R"::::"") Use 51

        Read QryTxt#16768

        If $ZC'=-1 Do

        . Close 51

        . Write !,"Error reading ",FName," at byte ",$ZB,!

        Close 51

        Write "done.",!,"Validating (sort of)..."

        Set QryTxt=$Translate(QryTxt,$Char(9,10,13),$Char(32,32,32))  ; Make all white space into spaces

        If $$Prolog(QryTxt)<0 Do  Quit

        . Write !,"Error: ",FName," is not a valid XML Query.",!

        Set QryTxt=$Extract(QryTxt,$Find(QryTxt,"<Query>")-7,$Length(QryTxt))

            ; Now set up the actions to do the processing as we parse

            ; Extract the file number from the long file name

        Set ^ABS383("END","File")="SET FileNo=$$GetFilNo(CurElem)"

            ; Initialize the NotFlag to false

        Set ^ABS383("START","Select")="SET NotFlag=0"

            ; Set the value of Select equal to it's child's value

        Set ^ABS383("END","Select")="DO Deherit(CurElem)"

            ; Distribute any NOT condition through the OR by DeMorgan's law (~(AVB) = (~A)^(~B))

        Set ^ABS383("START","OR")="IF NotFlag KILL @CurElem@(TagNo) SET @CurElem@(TagNo,""AND"")="""" SET Tag=""AND"""

            ; Process the OR (set union)

        Set ^ABS383("END","OR")="DO Union(CurElem):'NotFlag,Intrsct(CurElem):NotFlag IF NotFlag SET Tag=""AND"""

            ; Distribute any NOT condition through the AND by DeMorgan's law (~(A^B) = (~A)V(~B))

        Set ^ABS383("START","AND")="IF NotFlag KILL @CurElem@(TagNo) SET @CurElem@(TagNo,""OR"")="""" SET Tag=""OR"""

            ; Proces the AND (set intersection)

        Set ^ABS383("END","AND")="DO Intrsct(CurElem):'NotFlag,Union(CurElem):NotFlag IF NotFlag SET Tag=""OR"""

            ; Entering a NOT switches the NotFlag

        Set ^ABS383("START","NOT")="SET NotFlag='NotFlag"

            ; Exiting a NOT switches the NotFlag back and brings up the childs set of matches (already NOT'ed)

        Set ^ABS383("END","NOT")="SET NotFlag='NotFlag DO Deherit(CurElem)"

            ; If the NotFlag is set, reverse the boolean sense of all boolean operators

        Set ^ABS383("START","EQ")="IF NotFlag KILL @CurElem@(TagNo) SET @CurElem@(TagNo,""NE"")="""""

        Set ^ABS383("START","NE")="IF NotFlag KILL @CurElem@(TagNo) SET @CurElem@(TagNo,""EQ"")="""""

        Set ^ABS383("START","GT")="IF NotFlag KILL @CurElem@(TagNo) SET @CurElem@(TagNo,""LE"")="""""

        Set ^ABS383("START","LT")="IF NotFlag KILL @CurElem@(TagNo) SET @CurElem@(TagNo,""GE"")="""""

        Set ^ABS383("START","GE")="IF NotFlag KILL @CurElem@(TagNo) SET @CurElem@(TagNo,""LT"")="""""

        Set ^ABS383("START","LE")="IF NotFlag KILL @CurElem@(TagNo) SET @CurElem@(TagNo,""GT"")="""""

        Set ^ABS383("START","SA")="IF NotFlag KILL @CurElem@(TagNo) SET @CurElem@(TagNo,""SB"")="""""

            ; This is a real search on the database now -- field op value

        Set ^ABS383("END","SAtom")="DO Find(CurElem)"

            ; Extract the field number from the long field name

        Set ^ABS383("END","Field")="DO GetFldNo(CurElem)"

            ; The Expr's value is the Content of Expr (<Expr>Content</Expr>)

        Set ^ABS383("START","Expr")="SET @CurElem@(TagNo,Tag)=$$GetCont()"

            ; Build a list of field numbers in Project

        Set ^ABS383("END","Project")="DO FldList(CurElem)"

       

            ; Now do the parse (and pre/post actions)!          

        Write "done.",!,"Parsing and searching..."

        Do BldTree(QryTxt)

        Write "done.",!,"Sorting..."

        Do Sort^ABS383E("^ABS383(""PARSE"")"); Do the sort on the result tree

        Write "done.",!,"Projecting results..."

        Do GenResp^ABS383E("^ABS383(""PARSE"")")  ; Generate the XML response from the sorted tree and projection

        Write "done.",!,"Results in ",OFName,!

        Quit

       

Prolog(QryTxt)            ; EXTRINSIC FUNCTION!

            ; Parse the prolog from QryTxt (minimally) 

            ; We narrowly assume that nothing except white space follows the "</Query>" end tag

        Set ProPat=""  ; Prolog Pattern

        Set ProPat=ProPat_"1""<?xml"""  ; Must start with "<?xml" exactly

        Set ProPat=ProPat_"1."" """   ; followed by at least one white space

        Set ProPat=ProPat_"1""version"""  ; followed by "version"

        Set ProPat=ProPat_"."" ""1""=""."" """  ; followed by optional space, "=", optional space

        Set ProPat=ProPat_".E1""?>"""  ; followed by anything then a "?>"

        Set ProPat=ProPat_".E1""<!DOCTYPE""1."" """ ; followed by anything then "<!DOCTYPE" and 1+ spaces

        Set ProPat=ProPat_"1""Query"".E1"">"""  ; followed by "Query" then anything and a ">"

        Set ProPat=ProPat_".E1""<Query>"".E1""</Query>""."" """  ; followed by anything then the <Query>...</Query> Element

        If QryTxt'?@ProPat Quit -1  ; Didn't pass the pattern

        Quit 0

       

BldTree(QryTxt)          ; Parse the Query element (and it's children in QryTxt

        Set TagNo=1

        Set (CurElem,ElemTop)="^ABS383(""PARSE"")"

        Set Pos=1

        For  Do  Quit:Mode<1

        . Set Mode=$$GetTag(.Tag)

        . If Mode=1 Do  ; Start Tag

        . . Set @CurElem@(TagNo,Tag)=""

        . . Xecute $Get(^ABS383("START",Tag)) ; Do the pre-processing

        . . Set CurElem=$Name(@CurElem@(TagNo,Tag))

        . . Set TagNo=TagNo+1

        . If Mode=2 Do  ; End Tag

        . . Xecute $Get(^ABS383("END",Tag)) ; Do the post-processing

        . . If Tag'=$QSubscript(CurElem,$QLength(CurElem)) Do  Quit

        . . . Write "Error: Improper nesting of tags in XML.",!

        . . . Set Mode=-1

        . . Set CurElem=$Name(@CurElem,$QLength(CurElem)-2)

        . If Mode=3 Do  ; Empty Tag

        . . Set @CurElem@(TagNo,Tag)=""

        . . Xecute $Get(^ABS383("START",Tag)) ; Do the pre-processing

        . . Set CurElem=$Name(@CurElem@(TagNo,Tag))

        . . Set TagNo=TagNo+1

        . . Xecute $Get(^ABS383("END",Tag)) ; Do the post-processing

        . . Set CurElem=$Name(@CurElem,$QLength(CurElem)-2)

        If Mode=-1 Write "Error parsing query.",!

        Quit

       

GetTag(Tag)            ; EXTRINSIC FUNUCTION, TAG MUST BE CALL BY REFERENCE

            ; Returns Mode.  Tag is  the next element tag (sans any "<", ">" or "/" characters), Mode is

            ; 1 (Start), 2 (End) or 3 (Empty) (or 0 (EOF) or  -1 (Error))

            ; Assumes Pos and QryTxt are appropriately defined

            ; Assumes no attributes (as is true for all Query's)

        New C,White,NameP,S,Mode

        Set S=" "  ; Space

        Set NameP="1(1AN,1""."",1""-"",1""_"",1"";"")"  ; Pattern for a Name character

        Set Mode=1

        For  Do  Quit:C="<"  Quit:Mode<1

        . Set C=$Extract(QryTxt,Pos)

        . Set Pos=Pos+1

        . Set:Pos>$Length(QryTxt) Mode=0

        . Quit:C=S  ; Just eat white space

        . Set:C'="<" Mode=-1  ; 1'st non-white char has to be "<"

        Quit:Mode<1 Mode

        If $Extract(QryTxt,Pos)="/" Do

        . Set Mode=2  ; End tag

        . Set Pos=Pos+1

        Set Tag=""

        For  Do  Quit:C=">"  Quit:C="/"  Quit:Mode<1

        . Set C=$Extract(QryTxt,Pos)

        . Set Pos=Pos+1

        . Set:Pos>$Length(QryTxt) Mode=0

        . If C=S Do  Quit

        . . Set:Tag="" Mode=-1  ; Have to have tag name before white space

        . Set:C?@NameP Tag=Tag_C

        If Mode>0,C="/" Do

        . Set Mode=3  ; Empty tag

        . If $Extract(QryTxt,Pos)'=">" Set Mode=-1  ; False empty tag

        . Set Pos=Pos+1

        Quit Mode

       

GetCont()            ; EXTRINSIC

            ; Return the content. 

            ; Assumes that QryTxt and Pos are set appropriately

        New Expr

        Set Expr=$Piece($Extract(QryTxt,Pos,$Length(QryTxt)),"<",1)

        Set Pos=Pos+$Length(Expr)

        Quit Expr

       

GetFilNo(CurElem)    ; EXTRINSIC FUNCTION

            ; Returns the file number from the long form of the file reference, at the child of CurElem

        New NodeNo,Sub

        Set