"""Module for XML collections of items."""
import os
import pandas as pd
from lxml import etree
import xmlcoll.base as xb
class Item(xb.Properties):
    """A named data item that carries a dictionary of properties.

    Args:
        ``name`` (:obj:`str`): The name of the item.

        ``properties`` (:obj:`dict`, optional): A dictionary of properties
        with which to initialize the item.

    """

    def __init__(self, name, properties=None):
        super().__init__()
        self.name = name
        # Each item starts from its own empty property dictionary; any
        # supplied properties are then merged in through the base class.
        self.properties = {}
        if properties:
            self.update_properties(properties)

    def get_name(self):
        """Method to retrieve the name of the item.

        Return:
            :obj:`str`: The name of the item.

        """
        return self.name
class Collection(xb.Properties):
    """A class for storing and retrieving data about data items.

    Items are kept in a dictionary keyed by item name, so adding an item
    whose name is already present replaces the earlier item.

    Args:
        ``items`` (:obj:`list`, optional): A list of individual
        :obj:`xmlcoll.coll.Item` objects.

    """

    def __init__(self, items=None):
        super().__init__()
        self.properties = {}
        self.collection = {}
        if items:
            for item in items:
                self.collection[item.get_name()] = item

    def add_item(self, item):
        """Method to add an item to a collection.

        Args:
            ``item`` (:obj:`xmlcoll.coll.Item`) The item to be added.

        Return:
            On successful return, the item has been added.  An existing
            item with the same name is replaced.

        """
        self.collection[item.get_name()] = item

    def remove_item(self, item):
        """Method to remove an item from an item collection.

        Args:
            ``item`` (:obj:`xmlcoll.coll.Item`) The item to be removed.

        Return:
            On successful return, the item has been removed.

        Raises:
            :obj:`KeyError`: If no item with that name is in the
            collection.

        """
        self.collection.pop(item.get_name())

    def get(self):
        """Method to retrieve the item collection as a dictionary.

        Returns:
            :obj:`dict`: A dictionary of the items, keyed by item name.
            This is the collection's own dictionary, not a copy.

        """
        return self.collection

    def get_dataframe(self, index_label="name", tag_delimiter="_"):
        """Method to retrieve the collection data as a pandas dataframe.

        Args:
            ``index_label`` (:obj:`str`, optional): Index label for the
            dataframe.

            ``tag_delimiter`` (:obj:`str`, optional): Delimiter used
            to separate tags in combined column names.  Leading and
            trailing whitespace is stripped from the delimiter before it
            is used.

        Returns:
            :obj:`pandas.DataFrame`: A pandas dataframe containing the
            collection data.  Columns are labeled by a string formed by
            concatenating property names and tags separated by the chosen
            delimiter.

        """
        rows = []
        index = []
        for name, item in self.collection.items():
            row = {}
            for key, value in item.get_properties().items():
                if isinstance(key, tuple):
                    # Tuple keys are (name, tag1, tag2, ...); flatten them
                    # into a single column label.
                    column = tag_delimiter.strip().join(key)
                else:
                    column = key
                row[column] = value
            rows.append(row)
            index.append(name)

        result = pd.DataFrame(data=rows, index=index)
        result.index.name = index_label
        return result

    def update_from_dataframe(
        self, data_frame, index_label="name", tag_delimiter="_"
    ):
        """Method to update collection data from a pandas dataframe.

        Each row whose ``index_label`` entry is not missing becomes an
        item named by that entry.  Every other non-missing cell in the row
        becomes a property of the item; a column name containing the
        delimiter is split into a tuple of property name and tags.

        Args:
            ``data_frame`` (:obj:`pandas.DataFrame`): The pandas dataframe.

            ``index_label`` (:obj:`str`, optional): Index label for the
            data frame.

            ``tag_delimiter`` (:obj:`str`, optional): Delimiter used
            to separate tags in combined column names.  Unlike in
            :meth:`get_dataframe`, the delimiter is used exactly as given.

        Returns:
            On successful return, the collection has been updated with
            the data in the data frame.

        """
        # Property columns are chosen before reset_index() so that the
        # former index is never treated as a property.  A list (rather
        # than a set) keeps the data frame's column order, which makes
        # the order of the resulting properties deterministic.
        excluded = {index_label, "index"}
        prop_columns = [
            col for col in data_frame.columns if col not in excluded
        ]

        frame = data_frame.reset_index()
        for _, row in frame.iterrows():
            name = row[index_label]
            if pd.isna(name):
                continue

            props = {}
            for col in prop_columns:
                value = row[col]
                if pd.isna(value):
                    continue
                parts = col.split(tag_delimiter)
                key = parts[0] if len(parts) == 1 else tuple(parts)
                props[key] = value

            item = Item(name)
            item.update_properties(props)
            self.add_item(item)

    def write_to_xml(self, file, pretty_print=True):
        """Method to write the collection to XML.

        Args:
            ``file`` (:obj:`str`) The output file name.

            ``pretty_print`` (:obj:`bool`, optional): If set to True,
            routine outputs the xml in nice indented format.

        Return:
            On successful return, the item collection data have been
            written to the XML output file.

        Raises:
            :obj:`TypeError`: If a property key is neither a string nor
            a tuple.

        """
        root = etree.Element("collection")
        xml = etree.ElementTree(root)

        # Collection-level properties come first, then one <item> element
        # per stored item.
        self._add_properties(root, self)

        items = etree.SubElement(root, "items")
        for item in self.get().values():
            el_item = etree.SubElement(items, "item")
            el_name = etree.SubElement(el_item, "name")
            el_name.text = item.get_name()
            self._add_properties(el_item, item)

        xml.write(file, pretty_print=pretty_print)

    def _add_properties(self, my_element, my_object):
        """Append a <properties> child describing ``my_object``.

        Nothing is appended when the object has no properties.  A string
        key becomes the ``name`` attribute of a <property> element; a
        tuple key supplies ``name`` from its first entry and ``tag1``,
        ``tag2``, ... from the remaining entries.  The property value is
        written as the element text via :obj:`str`.
        """
        my_props = my_object.get_properties()
        if not my_props:
            return

        props = etree.SubElement(my_element, "properties")
        for prop, value in my_props.items():
            if isinstance(prop, str):
                my_prop = etree.SubElement(props, "property", name=prop)
            elif isinstance(prop, tuple):
                my_prop = etree.SubElement(props, "property", name=prop[0])
                for i, tag in enumerate(prop[1:], start=1):
                    my_prop.set("tag" + str(i), tag)
            else:
                # Without this guard the value would be written onto the
                # previously created element (or fail on an unbound name).
                raise TypeError(
                    "Property keys must be str or tuple, not "
                    + type(prop).__name__
                )
            my_prop.text = str(value)

    def update_from_xml(self, file, xpath=""):
        """Method to update an item collection from an XML file.

        Args:
            ``file`` (:obj:`str`) The name of the XML file from which to
            update.

            ``xpath`` (:obj:`str`, optional): XPath expression to select
            items.  It is appended to ``//item``.  Defaults to all items.

        Returns:
            On successful return, the item collection has been updated.
            Property values are stored as the text read from the file.

        """
        parser = etree.XMLParser(remove_blank_text=True)
        xml = etree.parse(file, parser)
        xml.xinclude()

        coll = xml.getroot()
        self._update_properties(coll, self)

        for el_item in coll.xpath("//item" + xpath):
            names = el_item.xpath(".//name")
            item = Item(names[0].text)
            self._update_properties(el_item, item)
            self.add_item(item)

    def _update_properties(self, my_element, my_object):
        """Copy the <property> children of ``my_element`` to ``my_object``.

        Only the first <properties> child of the element is read.  A
        property with a single attribute is keyed by that attribute's
        value; otherwise the key is the tuple of all attribute values in
        the order in which they appear on the element.
        """
        el_props = my_element.xpath("properties")
        if not el_props:
            return

        my_props = {}
        for prop in el_props[0].xpath("property"):
            values = prop.attrib.values()
            key = values[0] if len(values) == 1 else tuple(values)
            my_props[key] = prop.text

        my_object.update_properties(my_props)

    def validate(self, file):
        """Method to validate a collection XML file.

        The file is checked against the ``xsd_pub/xmlcoll.xsd`` schema
        located next to this module.

        Args:
            ``file`` (:obj:`str`) The name of the XML file to validate.

        Returns:
            :obj:`str`: The text of the validation error log if the file
            is invalid, and nothing (None) if it is valid.

        """
        parser = etree.XMLParser(remove_blank_text=True)
        xml = etree.parse(file, parser)
        xml.xinclude()

        schema_file = os.path.join(
            os.path.dirname(__file__), "xsd_pub/xmlcoll.xsd"
        )
        xmlschema_doc = etree.parse(schema_file)
        xml_validator = etree.XMLSchema(xmlschema_doc)

        # validate() only reports success as a boolean; the details of a
        # failure are in the validator's error log.
        if xml_validator.validate(xml):
            return None
        return str(xml_validator.error_log)