<a href="https://colab.research.google.com/github/lucasgneccoh/BDSS_Dauphine/blob/main/notebooks/solutions/BDSS_TD1_XML_DTD_solutions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bases de données semi-structurées - TD 1 - XML and DTD

Main teacher: **Dario COLAZZO**

Teaching Assistant: **Lucas GNECCO**

Special thanks to **Beatrice NAPOLITANO**

Université Paris Dauphine - PSL

# TODO:

    - Make the notebook self contained, bring content from the pdf

# Introduction

Welcome!

In this notebook we will practice XML file validation with respect to a DTD and XPath to query XML documents.

Here is some important documentation and resources

XPath 3.1 

Beware that the XPath implementation we use in this TD (the one in the lxml Python library) is XPath 1.0, so some things might differ from the XPath 3.1 documentation linked below. The main aspects should, however, remain valid.

 
XML documentation by W3C 

https://www.w3.org/TR/2008/REC-xml-20081126/ 


XPath 3.1

https://www.w3.org/TR/2017/REC-xpath-31-20170321/ 
 

Regarding the XML namespaces 

https://www.w3schools.com/xml/xml_namespaces.asp 

 

Regarding the mixed content in Ex 1, 1.2 

http://www.featureblend.com/dtd-elements-mixed-content.html 

https://www.w3.org/TR/xml/#sec-mixed-content 

 
Regarding the axes in XPath 

https://www.w3schools.com/xml/xpath_axes.asp 

# Preamble
Import modules and define helper functions.

Run the cells in this section first: every later cell depends on them.

In [None]:
# Check whether lxml is installed: a line is printed only if the package is found.
# This command does not install anything; if nothing is printed, install lxml with pip.
!pip list | grep lxml

In [None]:
from lxml import etree
import re

In [None]:
# Functions to work with XML files

def validate_xml(xml_path: str, dtd_path: str) -> bool:
    """Validate an XML file against a DTD using the lxml library.

    Args:
        xml_path: Path of the XML document to validate.
        dtd_path: Path of the DTD file to validate against.

    Returns:
        True if the document is valid with respect to the DTD. False if the
        DTD cannot be parsed, the XML is not well-formed, or the document
        does not conform to the DTD. In every failure case the reason is
        printed.

    Raises:
        OSError: If the DTD file cannot be opened (it is not caught here).
    """
    try:
        # The DTD is parsed when the object is built, so the file handle can
        # (and should) be closed right afterwards.
        with open(dtd_path) as dtd_file:
            dtd = etree.DTD(dtd_file)
    except etree.DTDParseError as ed:
        print(f"DTDParseError: {ed}")
        for i, er in enumerate(ed.error_log):
            print(f"\t{i}-> {er.message}, at line {er.line}")
        etree.clear_error_log()
        return False

    try:
        xml_doc = etree.parse(xml_path)
    except etree.XMLSyntaxError as e:
        print(f"XMLSyntaxError: {e}")
        for i, er in enumerate(e.error_log):
            print(f"\t{i}-> {er.message}, at line {er.line}")
        etree.clear_error_log()
        return False

    result = dtd.validate(xml_doc)
    if not result:
        # Only the first validation error is reported
        print(dtd.error_log[0])

    return result

def write_xml_dtd_files_from_strings(xml_strings, dtd_strings, identifiers = None):
    """Write XML and DTD documents, given as strings, to files.

    For each identifier ``name`` the files ``name.xml`` and ``name.dtd`` are
    created in the current working directory, encoded as UTF-8.

    Args:
        xml_strings: One XML document (str) or a list of them.
        dtd_strings: One DTD (str) or a list of them, matching ``xml_strings``.
        identifiers: File name stems, one per document. A single string is
            accepted for a single document. Defaults to ``file_0``,
            ``file_1``, ...

    Returns:
        The list of identifiers that were used as file name stems.

    Raises:
        Exception: If the numbers of XML strings, DTD strings and
            identifiers do not match.
    """
    # If single strings are given, encapsulate them in lists
    if isinstance(xml_strings, str) and isinstance(dtd_strings, str):
        xml_strings, dtd_strings = [xml_strings], [dtd_strings]

    if len(xml_strings) != len(dtd_strings):
        raise Exception("Different number of XML and DTD strings!")

    # If no identifiers are given, create default ones. This determines file names
    if identifiers is None:
        identifiers = [f"file_{i}" for i in range(len(xml_strings))]
    elif isinstance(identifiers, str):
        # A bare string would otherwise be iterated character by character
        identifiers = [identifiers]

    # zip() would silently drop documents if fewer identifiers were given
    if len(identifiers) != len(xml_strings):
        raise Exception("Different number of identifiers and XML/DTD strings!")

    try:
        for xml_text, dtd_text, identifier in zip(xml_strings, dtd_strings, identifiers):
            # Write UTF-8 explicitly: it is the default XML encoding, whereas
            # the platform default encoding may be something else.
            with open(f"{identifier}.xml", "w", encoding="utf-8") as f:
                f.write(xml_text)
            with open(f"{identifier}.dtd", "w", encoding="utf-8") as f:
                f.write(dtd_text)
    except Exception:
        print("Problems while writing XML and DTD files")
        raise

    return identifiers



def test_validation(xml_string, dtd_string, validator):
    """Validate an XML document against a DTD when both are given as strings.

    The two strings are first saved as ``temp.xml`` and ``temp.dtd``; then
    ``validator`` is called with those two paths and its result is returned.
    """
    stem = 'temp'

    # Persist both documents so the validator can read them from disk
    write_xml_dtd_files_from_strings(xml_string, dtd_string, identifiers = [stem])

    xml_file, dtd_file = stem + ".xml", stem + ".dtd"
    return validator(xml_file, dtd_file)

def xpath_query_xml_string(xml_string, query_string):
    """Run an XPath query against an XML document given as a string.

    Whitespace-only runs between two tags are removed first, the result is
    written to ``xml_doc.xml`` (UTF-8) and that file is parsed with lxml.

    Args:
        xml_string: The XML document.
        query_string: The XPath expression to evaluate.

    Returns:
        Whatever the compiled ``etree.XPath`` object returns for the document.
    """
    xml_path = "xml_doc.xml"
    # Remove whitespace between tags to keep the 'real' text of each node.
    # Raw string, and plain \s: the former class [\s|\n] also matched a
    # literal '|' and would have deleted pipe characters between tags.
    compact_xml = re.sub(r">\s*<", "><", xml_string)
    with open(xml_path, "w", encoding="utf-8") as f:
        f.write(compact_xml)
    xml_doc = etree.parse(xml_path)
    query = etree.XPath(query_string)
    return query(xml_doc)

def xpath_query_xml_file(xml_path, query_string):
    """Evaluate an XPath expression against an XML file stored on disk.

    The file is parsed first, then the expression is compiled and applied to
    the parsed tree; the query result is returned unchanged.
    """
    tree = etree.parse(xml_path)
    return etree.XPath(query_string)(tree)


def print_xpath_query_results(results):
    """Pretty-print the result of an XPath query.

    Args:
        results: The value returned by an XPath evaluation. Usually a list
            of nodes; a scalar (number, boolean or string) is printed as a
            single value.

    For every list item that looks like an element (has ``tag``, ``text``
    and ``items()``), its tag, text and attributes are printed. Items that
    do not (for example attribute values, which are plain strings) are
    printed as-is after a ``--Except`` marker.
    """
    # Scalar results have no meaningful length to report
    if isinstance(results, (bool, int, float, str)):
        print(f"Result: {results}")
        return

    print(f"Total results: {len(results)}")
    print("*"*20 + "\n")
    for e in results:
        try:
            tag, text, attributes = e.tag, e.text, e.items()
        except AttributeError:
            # Not an element-like object: fall back to printing it directly.
            # Only AttributeError is caught, so unrelated errors and
            # KeyboardInterrupt are no longer swallowed.
            print("--Except")
            print(e)
            continue
        print(f"node tag: {tag}")
        print(f"node text: *{text}*")
        print(', '.join([f"{k} = {v}" for k, v in attributes]))
        print("-"*20)

## Save XML and DTD files from strings

In [None]:
# Sample documents used to smoke-test the helper functions.
# Each sample is registered below as (identifier, XML text, DTD text); the
# identifier becomes the file name stem when the files are written to disk.

### ---- FILE 1: address data stored in child elements ----
address_book_dtd = \
'''<!ELEMENT address (name,company,phone)>
<!ELEMENT name (#PCDATA)>
<!ELEMENT company (#PCDATA)>
<!ELEMENT phone (#PCDATA)>'''

address_book_xml = \
'''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE address SYSTEM "dtddoc.dtd">
<address>
    <name>Beatrice </name>
    <company>Paris-Dauphine</company>
    <phone>06 12345678</phone>
</address>'''

### ---- FILE 2: address data stored in attributes ----
address_book_2_dtd = \
'''<!ELEMENT address EMPTY>
<!ATTLIST address name CDATA #REQUIRED>
<!ATTLIST address company CDATA #IMPLIED>
<!ATTLIST address phone CDATA #REQUIRED>'''

address_book_2_xml = \
'''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE address SYSTEM "dtddoc2.dtd">
<address name="Beatrice" phone="06 12345678"/>'''

sample_files = [
    ("address_book", address_book_xml, address_book_dtd),
    ("address_book_2", address_book_2_xml, address_book_2_dtd),
]

identifiers = [sample[0] for sample in sample_files]
xml_strings = [sample[1] for sample in sample_files]
dtd_strings = [sample[2] for sample in sample_files]

# xml_string / dtd_string end up holding the last sample (FILE 2)
xml_string, dtd_string = xml_strings[-1], dtd_strings[-1]


In [None]:
# Write all the sample files to the temporary storage of the Colab session.
# test_files receives the identifiers (file name stems) that were used.
test_files = write_xml_dtd_files_from_strings(xml_strings, dtd_strings, identifiers)

Test XML validation

In [None]:
# Test validation using strings.
# xml_string / dtd_string hold the last sample defined above (FILE 2).
test_validation(xml_string, dtd_string, validate_xml)

In [None]:
# Test with the files of the first sample.
# Make sure you saved the files using the previous section,
# otherwise test_files is undefined and the files do not exist.
xml_path, dtd_path = f"{test_files[0]}.xml", f"{test_files[0]}.dtd"
validate_xml(xml_path, dtd_path)

# Exercise 1
Decide whether the XML documents are correct. To validate them, the corresponding DTD file must also be defined.

In [None]:
# ---- Ex 1.1 ----


#dtddoc.dtd
dtd_string = \
'''<!ELEMENT html (head,body)>
<!ELEMENT head (title)>
<!ELEMENT title (#PCDATA)>
<!ELEMENT body (p)>
<!ELEMENT p (#PCDATA)>
'''

# xmldoc.xml.
# XML document is correct !

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<html>
    <head>
        <title>Hello, World</title>
    </head>
    <body>
        <p>Hello, World</p>        
    </body>
</html>'''

xml_string_correct = xml_string

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.2 ----


#dtddoc.dtd
dtd_string = \
'''<!ELEMENT p ANY>
<!ELEMENT strong ANY>
<!ELEMENT em ANY>
'''

# xmldoc.xml. 

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>    
    <p> This is a test. This is a test of the <em>
    <strong>Emergency</em> Broadcast System.</strong></p>'''

# Tags are not correctly nested
# This can be fixed in numerous ways, this one seemed natural to me
xml_string_correct = \
'''<?xml version="1.0" encoding="UTF-8"?>    
    <p> This is a test. This is a test of the 
    <strong> <em>Emergency</em> Broadcast System.</strong></p>'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.3 ----
#dtddoc.dtd
dtd_string = \
'''<!ELEMENT root (note*)>
<!ELEMENT note (message)>
<!ATTLIST note date CDATA #IMPLIED>
<!ELEMENT message (to,from,heading,body)>
<!ELEMENT to (#PCDATA)>
<!ELEMENT from (#PCDATA)>
<!ELEMENT heading (#PCDATA)>
<!ELEMENT body (#PCDATA)>
'''

# xmldoc.xml. 

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
    <note date="12/11/2007">
        <!-- This is a comment -->
        <Message>
            <to>Tove</to>
            <from>Jani</from>
            <heading>Reminder</heading>
            <body>Dont forget me this weekend!</body>
        </message>
    </note>
    <note date="13/11/2007">
        <message>
            <to>Jani</to>
            <from>Tove</from>
            <heading>Re: Reminder</heading>
            <body>Ok!</body>
        </message>
    </note>'''

# Message tag starting with uppercase
# More important, there must be only one root! In this case I add a root element
# on top of the notes
xml_string_correct = \
'''<?xml version="1.0" encoding="UTF-8"?>
    <root>
        <note date="12/11/2007">
            <!-- This is a comment -->
            <message>
                <to>Tove</to>
                <from>Jani</from>
                <heading>Reminder</heading>
                <body>Dont forget me this weekend!</body>
            </message>
        </note>
        <note date="13/11/2007">
            <message>
                <to>Jani</to>
                <from>Tove</from>
                <heading>Re: Reminder</heading>
                <body>Ok!</body>
            </message>
        </note>
    </root>'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.4 ----
# DTD for the exercise. DTDs are not namespace-aware, so the prefixed names
# (xs:schema, ...) are declared literally and the namespace declaration
# xmlns:xs is declared as an ordinary #FIXED attribute.
dtd_string = \
'''<!ELEMENT xs:schema ANY>
<!ATTLIST xs:schema xmlns:xs CDATA #FIXED "http://www.w3.org/2001/XMLSchema">
<!ATTLIST xs:schema attributeFormDefault CDATA #IMPLIED>
<!ATTLIST xs:schema elementFormDefault CDATA #IMPLIED>
<!ELEMENT xs:element ANY>
<!ATTLIST xs:element maxOccurs CDATA #IMPLIED>
<!ATTLIST xs:element name CDATA #IMPLIED>
<!ELEMENT xs:sequence ANY>
<!ELEMENT xs:complexType ANY>
<!ELEMENT xs:attribute ANY>
<!ATTLIST xs:attribute name CDATA #REQUIRED>
<!ATTLIST xs:attribute type CDATA "string">
<!ATTLIST xs:attribute use CDATA #IMPLIED>
'''

# XML document for the exercise.
# XML document is correct ! Make sure to add the namespace in the DTD file
# as an attribute.
# NOTE: the second xs:attribute line used to lack its opening '<'. The
# document was still well-formed (the line was parsed as plain text inside
# xs:complexType), but the "y" attribute declaration was silently lost.

xml_string = \
'''<?xml version="1.0" encoding="utf-8"?>
<xs:schema attributeFormDefault="unqualified" elementFormDefault="qualified"
xmlns:xs="http://www.w3.org/2001/XMLSchema">
    <xs:element name="points">
        <xs:complexType>
            <xs:sequence>
                <xs:element maxOccurs="unbounded" name="point">
                    <xs:complexType>
                        <xs:attribute name="x" type="xs:unsignedShort" use="required" />
                        <xs:attribute name="y" type="xs:unsignedShort" use="required" />
                    </xs:complexType>
                </xs:element>
            </xs:sequence>
        </xs:complexType>
    </xs:element>
</xs:schema>'''

# Nothing to fix: the corrected document is the document itself
xml_string_correct = xml_string

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.5 ----
# DTD for the exercise: body holds any number of paragraphs; p and br may
# contain anything (ANY), so text mixed with <br/> inside <p> is allowed.
dtd_string = \
'''<!ELEMENT html (head,body)>
<!ELEMENT head (title)>
<!ELEMENT title (#PCDATA)>
<!ELEMENT body (p*)>
<!ELEMENT p ANY>
<!ELEMENT br ANY>
'''

# XML document as given in the exercise (not well-formed).

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<html>
    <head><title>Paragraphs</title></head>
    <body>
        <p>This is a paragraph.<br/>
        <p>This is another paragraph.<br/>
        <p>Third paragraph.
    </body>
</html>'''

# The three <p> elements are opened but never closed.
# Either add the three missing </p> closing tags, or keep the line breaks
# <br/> and use a single paragraph (the option chosen here).
xml_string_correct = \
'''<?xml version="1.0" encoding="UTF-8"?>
<html>
    <head><title>Paragraphs</title></head>
    <body>
        <p>
        This is a paragraph.<br/>
        This is another paragraph.<br/>
        Third paragraph.
        </p>
    </body>
</html>'''

# First the original document, then the corrected one
print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.6 ----
#dtddoc.dtd
dtd_string = \
'''<!ELEMENT rdf:RDF ANY>
<!ATTLIST rdf:RDF xmlns:rdf CDATA #FIXED "http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<!ATTLIST rdf:RDF xmlns:dc CDATA #FIXED "http://purl.org/dc/elements/1.1/">
<!ELEMENT rdf:Description ANY>
<!ATTLIST rdf:Description rdf:about CDATA #IMPLIED>
<!ELEMENT dc:title ANY>
<!ELEMENT dc:description ANY>
<!ELEMENT dc:subject ANY>
<!ELEMENT rdf:Bag ANY>
<!ELEMENT rdf:li ANY>
'''

# xmldoc.xml. 

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/">
    <rdf:Description rdf:about=http://www.AcronymFinder.com/>
        <dc:title>Acronym Finder</dc:title>
        <dc:description>The Acronym Finder is a world wide
            web (WWW) searchable database of more than 169,000
            abbreviations and acronyms about computers,
            technology, telecommunications, and military
            acronyms and abbreviations.</dc:description>
        <dc:subject>
            <rdf:Bag>
                <rdf:li>Astronomy</rdf:li>
                <rdf:li>Literature</rdf:li>
                <rdf:li>Mathematics</rdf:li>
                <rdf:li>Music</rdf:li>
                <rdf:li>Philosophy</rdf:li>
            </rdf:Bag>
        </dc:subject>
    </rdf:Description>
</rdf:RDF>'''

# In the rdf:Description tag, the attribute rdf:about must be a string inside quotes
xml_string_correct = \
'''<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/">
    <rdf:Description rdf:about="http://www.AcronymFinder.com/">
        <dc:title>Acronym Finder</dc:title>
        <dc:description>The Acronym Finder is a world wide
            web (WWW) searchable database of more than 169,000
            abbreviations and acronyms about computers,
            technology, telecommunications, and military
            acronyms and abbreviations.</dc:description>
        <dc:subject>
            <rdf:Bag>
                <rdf:li>Astronomy</rdf:li>
                <rdf:li>Literature</rdf:li>
                <rdf:li>Mathematics</rdf:li>
                <rdf:li>Music</rdf:li>
                <rdf:li>Philosophy</rdf:li>
            </rdf:Bag>
        </dc:subject>
    </rdf:Description>
</rdf:RDF>'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.7 ----
# DTD for the exercise: html contains exactly one body; every other element
# is declared ANY.
dtd_string = \
'''<!ELEMENT html (body)>
    <!ELEMENT body ANY>
    <!ELEMENT p ANY>
    <!ELEMENT i ANY>
    <!ELEMENT b ANY>
    <!ELEMENT br ANY>
'''

# XML document as given in the exercise (not well-formed).

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
    <html>
        <body>
            <p><b><i>This paragraph is bold and italic.</b></i></p><br/>
            <p><i><b>This paragraph is italic and bold.</i></b></p><br/>
        </body>
    </html>'''

# The b and i closing tags were in the wrong order: elements must be closed
# in the reverse order of their opening (proper nesting).
xml_string_correct = \
'''<?xml version="1.0" encoding="UTF-8"?>
    <html>
        <body>
            <p><b><i>This paragraph is bold and italic.</i></b></p><br/>
            <p><i><b>This paragraph is italic and bold.</b></i></p><br/>
        </body>
    </html>'''

# First the original document, then the corrected one
print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.8 ----
# DTD for the exercise
dtd_string = \
'''<!ELEMENT catalog (work*)>
    <!ELEMENT work (title,author*)>
    <!ATTLIST work type CDATA #IMPLIED>
    <!ATTLIST work date CDATA #IMPLIED>
    <!ELEMENT title (#PCDATA)>
    <!ELEMENT author (#PCDATA)>
'''

# XML document as given in the exercise.

xml_string = \
'''<catalog>
    <work type='prose' date='1906'>
        <title>The Gift Of The Magi</title>
        <author>O. Henry</author>
    </work>
    <work type='poem' date='1845'>
        <title>The Raven</title>
        <author>Edgar Allen Poe</author>
    </work>
    <work type='play' date='1601'>
        <title>Hamlet</title>
        <author>William Shakespeare</author>
    </work>
</catalog>'''

# The document is accepted as it is:
#  - in XML 1.0 the XML declaration is optional (a document SHOULD, not
#    MUST, begin with one);
#  - attribute values may be delimited by single or by double quotes.
# The version below only follows the recommended practice: it adds the XML
# declaration and uses double quotes.
xml_string_correct = \
'''<?xml version="1.0" encoding="UTF-8"?>
<catalog>
    <work type="prose" date="1906">
        <title>The Gift Of The Magi</title>
        <author>O. Henry</author>
    </work>
    <work type="poem" date="1845">
        <title>The Raven</title>
        <author>Edgar Allen Poe</author>
    </work>
    <work type="play" date="1601">
        <title>Hamlet</title>
        <author>William Shakespeare</author>
    </work>
</catalog>'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.9----
#dtddoc.dtd
dtd_string = \
'''<!ELEMENT letter (date?,addressee,greeting?,(paragraph|list)*,closing?)>
    <!ELEMENT date (#PCDATA)>
    <!ELEMENT addressee ANY>
    <!ELEMENT name (#PCDATA)>
    <!ELEMENT address_one (#PCDATA)>
    <!ELEMENT address_two (#PCDATA)>
    <!ELEMENT greeting ANY>
    <!ELEMENT paragraph ANY>
    <!ELEMENT italics (#PCDATA)>
    <!ELEMENT list (item*)>
    <!ELEMENT item ANY>
    <!ELEMENT closing ANY>
    
'''

# xmldoc.xml. 
# XML document is correct !

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<letter>
    <date>December 11, 2002</date>
    <addressee>
        <name>Melvile Dewey</name>
        <address_one>Columbia University</address_one>
        <address_two>New York, NY</address_two>
    </addressee>
    <greeting>Dear Melvile,</greeting>
    <paragraph>I have been reading your ideas concerning nature of
    librarianship, and <italics>I find them very intriguing</italics>.
    I would love the opportunity to discuss with you the role of the
    card catalog in today’s libraries considering the advent to World
    Wide Web. Specifically, how are things like Google and Amazon.com
    changing our patrons’ expectations of library services? Mr. Cutter
    and I will be discussing these ideas at the next Annual Meeting,
    and we are available at the follow dates/times:</paragraph>
    <list>
    <item>Monday, 2-4</item>
    <item>Tuesday, 3-5</item>
    <item>Thursday, 1-3</item>
    </list>
    <paragraph>We hope you can join us.</paragraph>
    <closing>Sincerely, S. R. Ranganathan</closing>
</letter>'''

xml_string_correct = xml_string

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.10----
#dtddoc.dtd
dtd_string = \
'''<!ELEMENT dictionary (word*)>
    <!ELEMENT word (update,name,description*,definition*)>
    <!ELEMENT update ANY>
    <!ATTLIST update date CDATA #IMPLIED>
    <!ELEMENT name (#PCDATA)>
    <!ATTLIST name is_acronym CDATA "false">
    <!ELEMENT description (#PCDATA)>
    <!ELEMENT definition (#PCDATA)>
    <!ATTLIST definition default CDATA #IMPLIED>
    
    
'''

# xmldoc.xml. 


xml_string = \
'''<?xml version="1.0"?>
<dictionary>
    <word>
        <update date="2002-12-23"/>
        <name is_acronym="true">XML</Name>
        <description>eXtensible Markup Language</description>
    </word>
    <word>
        <update date="2002-12-23"/>
        <name is_acronym="true">POP</name>
        <definition default>Post Office Protocol</definition>
        <definition>Point Of Purchase</definition>
</dictionary>'''

# Mistakes in lines 5 and 11
# Close second word, first word has closing name tag as Name.
# Also, when giving the value to the 'default' attribute in the 
# definition element, it should be given properly
xml_string_correct = \
'''<?xml version="1.0"?>
<dictionary>
    <word>
        <update date="2002-12-23"/>
        <name is_acronym="true">XML</name>
        <description>eXtensible Markup Language</description>
    </word>
    <word>
        <update date="2002-12-23"/>
        <name is_acronym="true">POP</name>
        <definition default="true">Post Office Protocol</definition>
        <definition>Point Of Purchase</definition>
    </word>
</dictionary>'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.11----
#dtddoc.dtd
dtd_string = \
'''<!ELEMENT domain ANY>
    <!ATTLIST domain type CDATA #IMPLIED>
    <!ELEMENT name ANY>
    <!ELEMENT memory ANY>
    <!ELEMENT vcpu ANY>
    <!ELEMENT features ANY>
    <!ELEMENT acpi ANY>
    <!ELEMENT pae ANY>
    <!ELEMENT clock ANY>
    <!ATTLIST clock offset CDATA #IMPLIED>
    <!ELEMENT disk ANY>
    <!ATTLIST disk type CDATA #IMPLIED>
    <!ATTLIST disk device CDATA #IMPLIED>
    <!ELEMENT driver ANY>
    <!ATTLIST driver name CDATA #IMPLIED>
    <!ATTLIST driver type CDATA #IMPLIED>
    <!ELEMENT source ANY>
    <!ATTLIST source file CDATA #IMPLIED>
    <!ELEMENT target ANY>
    <!ATTLIST target dev CDATA #IMPLIED>
    <!ATTLIST target bus CDATA #IMPLIED>
    <!ELEMENT readonly ANY>
'''

# xmldoc.xml. 

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<domain type='kvm>
    <name>domain</name><
    <memory>524288</memory>
    <vcpu>2</vcpu>
    <features><acpi/><pae/>
    <clock offset='utc'>
    <disk type='block' device='cdrom'>
        <driver name='qemu' type='raw'/>
        <source file='/path/to/image.iso'/>
        <tar get dev='hdc' bus='ide'/>
        <readonly/></name>
    </disk>
</domain>'''

# Mistakes in lines 2,3,6,11,12 
# line 2: Close the quotes
# line 3: open < without closure at the end of the line
# line 6: Close features
# Close clock
# Extra </name>
# target has a space in between, or the tag is tar and get is an attribute?
xml_string_correct = \
'''<?xml version="1.0" encoding="UTF-8"?>
<domain type="kvm">
    <name>domain</name>
    <memory>524288</memory>
    <vcpu>2</vcpu>
    <features><acpi/><pae/></features>
    <clock offset="utc"/>
    <disk type="block" device="cdrom">
        <driver name="qemu" type="raw"/>
        <source file="/path/to/image.iso"/>
        <target dev="hdc" bus="ide"/>
        <readonly/>
    </disk>
</domain>'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

In [None]:
# ---- Ex 1.12----
# DTD for the exercise. A root element is declared because the corrected
# document wraps everything in <root>.
dtd_string = \
''' <!ELEMENT root ANY>
    <!ELEMENT name (#PCDATA)>
    <!ELEMENT author (#PCDATA)>
    <!ELEMENT copyright (#PCDATA)>
    <!ATTLIST copyright holder CDATA #IMPLIED>
    <!ELEMENT ingredients ANY>
    <!ELEMENT list (item*)>
    <!ELEMENT item (#PCDATA)>
    <!ELEMENT cost (#PCDATA)>
    <!ELEMENT process ANY>
    <!ELEMENT p ANY>
    <!ELEMENT i ANY>
'''

# XML document as given in the exercise (not well-formed).

xml_string = \
'''<?xml version="1.0" encoding="UTF-8"?>
<name>Oyster Soup</name>
<author>Eric Lease Morgan</author>
<copyright holder=Eric Lease Morgan>&copy; 2003</copyright>
<ingredients>
<list>
<item>1 stalk of celery
<item>1 onion
<item>2 tablespoons of butter
<item>2 cups of oysters and their liquor
<item>2 cups of half & half
</list><cost>total cost < 36 euro </cost>
</ingredients>
<process><P>Begin by sauteing the celery and onions in butter until soft.
Add oysters, oyster liquor, and cream. Heat until the oysters float.
Serve in warm bowls.</p>
<p><i>Yummy!</p></i>
</process>'''

# Problems and fixes:
#  - There is no single root element: wrap everything in <root>.
#  - Attribute values must be quoted (holder="...").
#  - &copy; is not one of the five predefined XML entities (&amp; &lt; &gt;
#    &apos; &quot;) and the DTD does not declare it: use the character
#    reference &#169; (the copyright sign) instead.
#  - The <item> elements are never closed.
#  - A literal & or < in text must be escaped as &amp; / &lt;.
#  - Tag names are case-sensitive: <P> does not match </p>.
#  - </p></i> are closed in the wrong order.
xml_string_correct = \
'''<?xml version="1.0" encoding="UTF-8"?>
<root>
<name>Oyster Soup</name>
<author>Eric Lease Morgan</author>
<copyright holder="Eric Lease Morgan">&#169; 2003</copyright>
<ingredients>
<list>
<item>1 stalk of celery</item>
<item>1 onion</item>
<item>2 tablespoons of butter</item>
<item>2 cups of oysters and their liquor</item>
<item>2 cups of half &amp; half</item>
</list><cost>total cost &lt; 36 euro </cost>
</ingredients>
<process><p>Begin by sauteing the celery and onions in butter until soft.
Add oysters, oyster liquor, and cream. Heat until the oysters float.
Serve in warm bowls.</p>
<p><i>Yummy!</i></p>
</process>
</root>'''

print(test_validation(xml_string, dtd_string, validate_xml))
print(" ***** Corrected ******")
print(test_validation(xml_string_correct, dtd_string, validate_xml))

# Exercise 2
Write a DTD file for the given XML file

In [None]:
xml_string = '''<?xml version="1.0"?>
<shiporder orderid="889923">
    <orderperson>John Smith</orderperson>
    <shipto>
        <name>Ola Nordmann</name>
        <address>Langgt 23</address>
        <city>4000 Stavanger</city>
        <country>Norway</country>
    </shipto>
    <item>
        <title>Empire Burlesque</title>
        <note>Special Edition</note>
        <quantity>1</quantity>
        <price>10.90</price>
    </item>
    <item>
        <title>Hide your heart</title>
        <quantity>1</quantity>
        <price>9.90</price>
    </item>
</shiporder>'''

dtd_string = '''
    <!ELEMENT shiporder (orderperson,shipto,item*)>
    <!ATTLIST shiporder orderid CDATA #REQUIRED>
    <!ELEMENT orderperson (#PCDATA)>
    <!ELEMENT shipto (name,address,city,country)>
    <!ELEMENT name (#PCDATA)>
    <!ELEMENT address (#PCDATA)>
    <!ELEMENT city (#PCDATA)>
    <!ELEMENT country (#PCDATA)>
    <!ELEMENT item (title,note?,quantity,price)>
    <!ELEMENT title (#PCDATA)>
    <!ELEMENT note (#PCDATA)>
    <!ELEMENT quantity (#PCDATA)>
    <!ELEMENT price (#PCDATA)>
'''


test_validation(xml_string, dtd_string, validate_xml)

# Exercise 3
Write an XML file for the given DTD file

In [None]:
dtd_string = \
''' <!ELEMENT stock (new-car | used-car)*>
    <!ELEMENT new-car (model, price)>
    <!ELEMENT used-car (model, price, mileage, condition?)>
    <!ELEMENT model (#PCDATA)>
    <!ELEMENT price (#PCDATA)>
    <!ELEMENT mileage (#PCDATA)>
    <!ELEMENT condition (#PCDATA)>
'''

xml_string = \
'''<?xml version="1.0"?>
    <stock>
        <new-car>
            <model>Renault Twingo</model>
            <price>1234</price>
        </new-car>            
        <new-car>
            <model>Peugout 206</model>
            <price>2345</price>
        </new-car>
        <used-car>
            <model>Nissan Pathfinder</model>
            <price>3500</price>
            <mileage>100000</mileage>
        </used-car>
        <used-car>
            <model>Volkswagen Tiguan</model>
            <price>2000</price>
            <mileage>50000</mileage>
            <condition>good</condition>
        </used-car>
    </stock>
'''
test_validation(xml_string, dtd_string, validate_xml)

# Exercise 4
Create an XML file and the corresponding DTD file for the situation described

In [None]:
dtd_string = \
''' <!ELEMENT games (game*)>
    <!ELEMENT game (home,away)>
    <!ELEMENT home (player+,goal*,yellow*,red*)>
    <!ELEMENT away (player+,goal*,yellow*,red*)>
    <!ELEMENT player (name,number)>
    <!ELEMENT name (#PCDATA)>
    <!ELEMENT goal (number,time)>
    <!ELEMENT yellow (number,time)>
    <!ELEMENT red (number,time)>
    <!ELEMENT number (#PCDATA)>
    <!ELEMENT time (#PCDATA)>
'''

xml_string = \
'''<?xml version="1.0"?>
    <games>
        <game>
            <home>
                <player>
                    <name>Home player 1</name>
                    <number>1</number>
                </player>
                <player>
                    <name>Home player 2</name>
                    <number>2</number>
                </player>
                <player>
                    <name>Home player 3</name>
                    <number>3</number>
                </player>
                <goal>
                    <number>3</number>
                    <time>62</time>
                </goal>
                <yellow>
                    <number>1</number>
                    <time>13</time>
                </yellow>
            </home>
            <away>
                <player>
                    <name>Away player 1</name>
                    <number>1</number>
                </player>
                <player>
                    <name>Away player 2</name>
                    <number>2</number>
                </player>
                <player>
                    <name>Away player 3</name>
                    <number>3</number>
                </player>            
                <yellow>
                    <number>2</number>
                    <time>22</time>
                </yellow>
                <red>
                    <number>3</number>
                    <time>89</time>
                </red>
            </away>
        </game>
    </games>
'''
test_validation(xml_string, dtd_string, validate_xml)

# Exercise 5
XPath axes: write down the nodes returned by each axis

To see this, we will replicate the tree with a dummy XML file and do all the queries

In [None]:
dtd_string = \
''' <!ELEMENT root ANY>
    <!ELEMENT node ANY>
    <!ATTLIST node id CDATA #IMPLIED>
    <!ATTLIST node attribute CDATA #IMPLIED>
    <!ELEMENT text ANY>
'''

xml_string = \
'''<?xml version="1.0"?>
    <root>
        <node id="1" attribute="node 2 - value 7">
            <node id="3">
                <node id="8">
                    <text>17</text>
                </node>
                <node id="9">
                </node>
            </node>
            <node id="4" attribute="node 10 - value None">
                <node id="11" attribute="node 18 - value None">
                    <text>19</text>
                </node>
                <node id="12" attribute="node 20 - value None">
                    <node id="21">
                    </node>
                    <node id="22" attribute="node 25 - value None">
                        <text>26</text>
                    </node>
                    <node id="23">                        
                    </node>
                    <text>24</text>
                </node>
            </node>
            <node id="5">
                <node id="13">
                </node>
                <node id="14">                    
                </node>
            </node>
            <node id="6" attribute="node 15 - value None">                
                <text>16</text>
            </node>
        </node>
    </root>
'''
test_validation(xml_string, dtd_string, validate_xml)

In [None]:
""" Write the query here """

# One query per XPath axis, each evaluated from the node with id=12.
# Every axis is run and printed in turn; re-assigning query_string twelve
# times in a row only ever ran the last one (self).
axes = [
    'ancestor',
    'ancestor-or-self',
    'attribute',
    'child',
    'descendant',
    'descendant-or-self',
    'following',
    'following-sibling',
    'parent',
    'preceding',
    'preceding-sibling',
    'self',
]

# -------------------------------------------

for axis in axes:
    query_string = f'//node[@id=12]/{axis}::*'
    print(f"\n===== {query_string} =====")
    results = xpath_query_xml_string(xml_string, query_string)
    print_xpath_query_results(results)

# Exercise 6
Create an XML file where both queries yield the same result

In [None]:
dtd_string = \
''' <!ELEMENT root ANY>
    <!ELEMENT cours ANY>
    <!ATTLIST cours id CDATA #IMPLIED>
    <!ELEMENT intitule ANY>
    <!ELEMENT XML ANY>
'''

xml_string = \
'''<?xml version="1.0"?>
    <root>
        <cours id="1">
            <intitule>XML</intitule>
            <XML>XML</XML>
        </cours>
        <cours id="2">
            <intitule>something</intitule>
        </cours>
    </root>
'''
test_validation(xml_string, dtd_string, validate_xml)

In [None]:
# Query 1 compares intitule with the string literal 'XML'.
# Query 2 compares intitule with the string value of a child element named XML.
query_string_1 = "//cours[intitule='XML']"
query_string_2 = "//cours[intitule=XML]"

# Run both queries in order, printing a separator line between the outputs
for position, current_query in enumerate((query_string_1, query_string_2)):
    if position > 0:
        print("\n" + "~"*40 + "\n")
    results = xpath_query_xml_string(xml_string, current_query)
    print_xpath_query_results(results)

# Exercise 7

Explain the difference between the two queries and show a document where they yield different outputs

In [None]:
dtd_string = \
''' <!ELEMENT root ANY>
    <!ELEMENT A ANY>
    <!ATTLIST A id CDATA #IMPLIED>
    <!ELEMENT B ANY>
    <!ATTLIST B id CDATA #IMPLIED>
'''


xml_string = \
'''<?xml version="1.0"?>    
    <A id="1">
        <B id="2">
            <B id ="3"></B>
            <B id ="4"></B>
        </B>          
    </A>    
'''
test_validation(xml_string, dtd_string, validate_xml)

In [None]:
# '//B[position()=1]' is short for the extended form below: the predicate
# applies to the child step, so it selects every B that is the first B child
# of its parent. '/descendant::B[position()=1]' selects only the first B
# descendant of the document, in document order.
query_string_1 = "//B[position()=1]"
query_string_1_extended = "/descendant-or-self::node()/B[position()=1]"
query_string_2 = "/descendant::B[position()=1]"

# Order: first query, first query extended, second query
ordered_queries = (query_string_1, query_string_1_extended, query_string_2)

for position, current_query in enumerate(ordered_queries):
    if position > 0:
        print("\n" + "~"*40 + "\n")
    results = xpath_query_xml_string(xml_string, current_query)
    print_xpath_query_results(results)


# Exercise 8
Run the XPath queries against the Films database

The cell below downloads the files so that you can work on them later

In [None]:
dtd_link = "https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/films.dtd"
xml_link = "https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/films.xml"

!rm "./films.dtd"
!rm "./films.xml"

# Download the imdb sample file
!wget {dtd_link}
!wget {xml_link}

# If the download fails, you will have to load the files into the Colab session. 
# Go to the Files section on the left panel

if validate_xml("films.xml", "films.dtd"):
    print("Files were downloaded correctly")

In [None]:
""" Write the query here """

# All the queries of the exercise, as (label, XPath expression) pairs.
# Every query is run and printed in turn; re-assigning query_string for each
# question only ever ran the last one (query 14).

# XPath 1.0 has no lower-case(); translate() is the usual workaround
lower_title = 'translate(TITRE, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")'

# NOTE(review): query 7 hard-codes the reference year 2022 - update it if
# the query is meant to be relative to the current year.
reference_year = 2022

queries = [
    ('1', '//TITRE'),
    ('2', '/FILMS/FILM[@Annee=1980]/TITRE'),
    ('3', '/FILMS/FILM[TITRE="Alien"]/RESUME'),
    # 4 (Two options). The second one tests NOM and PRENOM independently, so
    # it also matches a film where they belong to two different descendants.
    ('4a', '/FILMS/FILM/ROLES/ROLE[PRENOM="Bruce" and NOM="Willis"]/parent::node()/parent::node()/TITRE'),
    ('4b', '/FILMS/FILM[.//NOM="Willis" and .//PRENOM="Bruce"]/TITRE'),
    ('5', '/FILMS/FILM[RESUME]/TITRE'),
    ('6', '/FILMS/FILM[not(RESUME)]/TITRE'),
    ('7', f'/FILMS/FILM[{reference_year}-@Annee>30]/TITRE'),
    # 8 (Two options: the first is case-sensitive, the second is not)
    ('8a', '/FILMS/FILM[TITRE="Reservoir Dogs"]/ROLES/ROLE[NOM="Keitel" and PRENOM="Harvey"]/INTITULE'),
    ('8b', f'/FILMS/FILM[{lower_title}="reservoir dogs"]/ROLES/ROLE[NOM="Keitel" and PRENOM="Harvey"]/INTITULE'),
    ('9', '/FILMS/FILM[position()=last()]/TITRE'),
    # 10 (For one file there is The Shining, for the other only Shining)
    ('10a', f'/FILMS/FILM[{lower_title}="shining"]/preceding::FILM[position()=1]/TITRE'),
    ('10b', '/FILMS/FILM[TITRE="Shining"]/preceding::FILM[position()=1]/TITRE'),
    ('11', '/FILMS/ARTISTE[@id_art=/FILMS/FILM[TITRE="Vertigo"]/MES/@id_mes]/ACTNOM'),
    ('12', '/FILMS/FILM/TITRE[contains(text(), "S")]'),
    ('13', '//*[count(descendant::*)=3]'),
    ('14', '//*[contains(name(), "TU")]'),
]

# -------------------------------------------
xml_path = "films.xml"
for label, query_string in queries:
    print(f"\n===== Query {label}: {query_string} =====")
    results = xpath_query_xml_file(xml_path, query_string)
    print_xpath_query_results(results)