ABS383D ; Read the a presented XML Query and tokenize it to a
tree [ 12/10/1999 5:17 PM ]
; For CECS 383: The Science and Technology of the World
Wide Web by Arthur B. Smith (754104)
; This is NOT a formal lexical analyzer for XML. It makes many assumptions.
; As a result, this will accept "XML" files
that are not valid (or even well-formed)
; And will not accept XML files that don't conform to the
general structure of queries
; In particular, only <Expr> tags have content, so
content is ignored except when in an <Expr>
; Another implicit assumption is that the Entire Query
file is no more than 16K big.
; That should not be a problem in any realistic query.
ReadQry(FName) ; Read the Query from the file named FName and generate a
; tokenized form in
^ABS383("QUERY",$JOB,"TOKENS"). If an error occurs, a "ERROR" node is
; inserted in
^ABS383("QUERY",$JOB,"TOKENS") at that point.
Set
NumPatrn=".1""-""1.N.1(1"".""1.N).1(1""E"".1""-""1.N)" ; Pattern for a legal number
Write
"Reading..."
Open
51:(FName:"R"::::"") Use 51
Read
QryTxt#16768
If
$ZC'=-1 Do
. Close
51
. Write
!,"Error
reading ",FName," at byte
",$ZB,!
Close
51
Write
"done.",!,"Validating
(sort of)..."
Set
QryTxt=$Translate(QryTxt,$Char(9,10,13),$Char(32,32,32)) ; Make all white space into spaces
If
$$Prolog(QryTxt)<0 Do Quit
. Write
!,"Error:
",FName," is not a
valid XML Query.",!
Set
QryTxt=$Extract(QryTxt,$Find(QryTxt,"<Query>")-7,$Length(QryTxt))
; Now set up the actions to do the processing as we parse
; Extract the file number from the long file name
Set
^ABS383("END","File")="SET
FileNo=$$GetFilNo(CurElem)"
; Initialize the NotFlag to false
Set
^ABS383("START","Select")="SET
NotFlag=0"
; Set the value of Select equal to it's child's value
Set
^ABS383("END","Select")="DO
Deherit(CurElem)"
; Distribute any NOT condition through the OR by
DeMorgan's law (~(AVB) = (~A)^(~B))
Set
^ABS383("START","OR")="IF
NotFlag KILL @CurElem@(TagNo) SET @CurElem@(TagNo,""AND"")=""""
SET Tag=""AND"""
; Process the OR (set union)
Set
^ABS383("END","OR")="DO
Union(CurElem):'NotFlag,Intrsct(CurElem):NotFlag IF NotFlag SET
Tag=""AND"""
; Distribute any NOT condition through the AND by
DeMorgan's law (~(A^B) = (~A)V(~B))
Set
^ABS383("START","AND")="IF
NotFlag KILL @CurElem@(TagNo) SET
@CurElem@(TagNo,""OR"")="""" SET
Tag=""OR"""
; Proces the AND (set intersection)
Set
^ABS383("END","AND")="DO
Intrsct(CurElem):'NotFlag,Union(CurElem):NotFlag IF NotFlag SET
Tag=""OR"""
; Entering a NOT switches the NotFlag
Set
^ABS383("START","NOT")="SET
NotFlag='NotFlag"
; Exiting a NOT switches the NotFlag back and brings up
the childs set of matches (already NOT'ed)
Set
^ABS383("END","NOT")="SET
NotFlag='NotFlag DO Deherit(CurElem)"
; If the NotFlag is set, reverse the boolean sense of all
boolean operators
Set
^ABS383("START","EQ")="IF
NotFlag KILL @CurElem@(TagNo) SET
@CurElem@(TagNo,""NE"")="""""
Set
^ABS383("START","NE")="IF
NotFlag KILL @CurElem@(TagNo) SET
@CurElem@(TagNo,""EQ"")="""""
Set
^ABS383("START","GT")="IF
NotFlag KILL @CurElem@(TagNo) SET
@CurElem@(TagNo,""LE"")="""""
Set
^ABS383("START","LT")="IF
NotFlag KILL @CurElem@(TagNo) SET @CurElem@(TagNo,""GE"")="""""
Set
^ABS383("START","GE")="IF
NotFlag KILL @CurElem@(TagNo) SET
@CurElem@(TagNo,""LT"")="""""
Set
^ABS383("START","LE")="IF
NotFlag KILL @CurElem@(TagNo) SET
@CurElem@(TagNo,""GT"")="""""
Set
^ABS383("START","SA")="IF
NotFlag KILL @CurElem@(TagNo) SET
@CurElem@(TagNo,""SB"")="""""
; This is a real search on the database now -- field op
value
Set
^ABS383("END","SAtom")="DO
Find(CurElem)"
; Extract the field number from the long field name
Set
^ABS383("END","Field")="DO
GetFldNo(CurElem)"
; The Expr's value is the Content of Expr
(<Expr>Content</Expr>)
Set
^ABS383("START","Expr")="SET
@CurElem@(TagNo,Tag)=$$GetCont()"
; Build a list of field numbers in Project
Set
^ABS383("END","Project")="DO
FldList(CurElem)"
; Now do the parse (and pre/post actions)!
Write
"done.",!,"Parsing
and searching..."
Do
BldTree(QryTxt)
Write
"done.",!,"Sorting..."
Do
Sort^ABS383E("^ABS383(""PARSE"")"); Do the sort on the result
tree
Write
"done.",!,"Projecting
results..."
Do
GenResp^ABS383E("^ABS383(""PARSE"")") ; Generate the XML response from the sorted tree and projection
Write
"done.",!,"Results
in ",OFName,!
Quit
Prolog(QryTxt) ; EXTRINSIC FUNCTION!
; Parse the prolog from QryTxt (minimally)
; We narrowly assume that nothing except white space
follows the "</Query>" end tag
Set
ProPat="" ; Prolog Pattern
Set
ProPat=ProPat_"1""<?xml""" ; Must start with "<?xml"
exactly
Set
ProPat=ProPat_"1.""
""" ; followed by at least
one white space
Set
ProPat=ProPat_"1""version""" ; followed by "version"
Set
ProPat=ProPat_".""
""1""=""."" """ ; followed by optional space, "=",
optional space
Set
ProPat=ProPat_".E1""?>""" ; followed by anything then a
"?>"
Set
ProPat=ProPat_".E1""<!DOCTYPE""1.""
""" ; followed by anything then "<!DOCTYPE" and 1+
spaces
Set
ProPat=ProPat_"1""Query"".E1"">""" ; followed by "Query" then
anything and a ">"
Set
ProPat=ProPat_".E1""<Query>"".E1""</Query>"".""
""" ; followed by anything
then the <Query>...</Query> Element
If
QryTxt'?@ProPat Quit -1 ; Didn't pass the pattern
Quit
0
BldTree(QryTxt) ; Parse the Query element (and it's children in QryTxt
Set
TagNo=1
Set
(CurElem,ElemTop)="^ABS383(""PARSE"")"
Set
Pos=1
For Do Quit:Mode<1
. Set
Mode=$$GetTag(.Tag)
. If
Mode=1 Do ; Start Tag
. . Set
@CurElem@(TagNo,Tag)=""
. . Xecute
$Get(^ABS383("START",Tag)) ; Do the pre-processing
. . Set
CurElem=$Name(@CurElem@(TagNo,Tag))
. . Set
TagNo=TagNo+1
. If
Mode=2 Do ; End Tag
. . Xecute
$Get(^ABS383("END",Tag)) ; Do the post-processing
. . If
Tag'=$QSubscript(CurElem,$QLength(CurElem)) Do Quit
. . . Write "Error: Improper nesting of tags in
XML.",!
. . . Set Mode=-1
. . Set
CurElem=$Name(@CurElem,$QLength(CurElem)-2)
. If
Mode=3 Do ; Empty Tag
. . Set
@CurElem@(TagNo,Tag)=""
. . Xecute
$Get(^ABS383("START",Tag)) ; Do the pre-processing
. . Set
CurElem=$Name(@CurElem@(TagNo,Tag))
. . Set
TagNo=TagNo+1
. . Xecute
$Get(^ABS383("END",Tag)) ; Do the post-processing
. . Set
CurElem=$Name(@CurElem,$QLength(CurElem)-2)
If
Mode=-1 Write "Error
parsing query.",!
Quit
GetTag(Tag) ; EXTRINSIC FUNUCTION, TAG MUST BE CALL BY REFERENCE
; Returns Mode.
Tag is the next element tag
(sans any "<", ">" or "/" characters), Mode
is
; 1 (Start), 2 (End) or 3 (Empty) (or 0 (EOF) or -1 (Error))
; Assumes Pos and QryTxt are appropriately defined
; Assumes no attributes (as is true for all Query's)
New
C,White,NameP,S,Mode
Set
S=" " ; Space
Set
NameP="1(1AN,1""."",1""-"",1""_"",1"";"")" ; Pattern for a Name character
Set
Mode=1
For Do Quit:C="<" Quit:Mode<1
. Set
C=$Extract(QryTxt,Pos)
. Set
Pos=Pos+1
. Set:Pos>$Length(QryTxt) Mode=0
. Quit:C=S ; Just eat white space
. Set:C'="<" Mode=-1 ; 1'st non-white char has to be
"<"
Quit:Mode<1 Mode
If
$Extract(QryTxt,Pos)="/" Do
. Set
Mode=2 ; End tag
. Set
Pos=Pos+1
Set
Tag=""
For Do Quit:C=">" Quit:C="/" Quit:Mode<1
. Set
C=$Extract(QryTxt,Pos)
. Set
Pos=Pos+1
. Set:Pos>$Length(QryTxt) Mode=0
. If
C=S Do Quit
. . Set:Tag="" Mode=-1 ; Have to have tag name before white space
. Set:C?@NameP Tag=Tag_C
If
Mode>0,C="/" Do
. Set
Mode=3 ; Empty tag
. If
$Extract(QryTxt,Pos)'=">" Set Mode=-1 ; False empty tag
. Set
Pos=Pos+1
Quit
Mode
GetCont() ; EXTRINSIC
; Return the content.
; Assumes that QryTxt and Pos are set appropriately
New
Expr
Set
Expr=$Piece($Extract(QryTxt,Pos,$Length(QryTxt)),"<",1)
Set
Pos=Pos+$Length(Expr)
Quit
Expr
GetFilNo(CurElem) ; EXTRINSIC FUNCTION
; Returns the file number from the long form of the file
reference, at the child of CurElem
New
NodeNo,Sub
Set