Package Bio :: Package Phylo :: Module CDAOIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.CDAOIO

  1  # Copyright (C) 2013 by Ben Morris (ben@bendmorris.com) 
  2  # Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox 
  3  # and Bio.Phylo.Newick, copyright 2009 by Eric Talevich. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license. Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """I/O function wrappers for the RDF/CDAO file format. 
 10   
 11  This is an RDF format that conforms to the Comparative Data Analysis Ontology (CDAO). 
 12  See: http://www.evolutionaryontology.org/cdao 
 13   
 14  This module requires the RDFLib Python library, v3.2.1 or later (https://github.com/RDFLib/rdflib) 
 15   
 16  The CDAOIO.Parser, in addition to parsing text files, can also parse directly 
 17  from a triple store that implements the Redland storage interface; similarly, 
 18  the CDAOIO.Writer can store triples in a triple store instead of serializing 
 19  them to a file. 
 20  """ 
 21   
 22  from Bio._py3k import StringIO 
 23   
 24  from Bio.Phylo import CDAO 
 25  from ._cdao_owl import cdao_elements, cdao_namespaces, resolve_uri 
 26  import os 
class CDAOError(Exception):
    """Signal that building a CDAO object cannot proceed."""
try:
    import rdflib
except ImportError:
    raise CDAOError('Support for CDAO tree format requires RDFlib.')


def _rdflib_version_tuple(version):
    """Return the leading numeric components of a version string.

    Up to three dot-separated components are read; each contributes its
    leading run of digits (so '3.2.1rc1' gives (3, 2, 1)).  Reading stops
    at the first component that does not start with a digit.
    """
    numbers = []
    for piece in str(version).split('.')[:3]:
        digits = ''
        for char in piece:
            if not char.isdigit():
                break
            digits += char
        if not digits:
            break
        numbers.append(int(digits))
    return tuple(numbers)


rdfver = rdflib.__version__
# Compare numerically: looking only at the first character of the version
# string would treat a future "10.x" release as if it were "1.x".
# An unparseable version string is given the benefit of the doubt.
_rdfver_tuple = _rdflib_version_tuple(rdfver)
if _rdfver_tuple and _rdfver_tuple < (3, 2, 1):
    raise CDAOError(
        'Support for CDAO tree format requires RDFlib v3.2.1 or later.')

# prefix -> namespace URI; the generic RDF vocabularies plus the
# CDAO-specific namespaces imported from ._cdao_owl
RDF_NAMESPACES = {
    'owl': 'http://www.w3.org/2002/07/owl#',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
}
RDF_NAMESPACES.update(cdao_namespaces)
# pad node ids with zeroes until they're at least this length
ZEROES = 8
def qUri(x):
    """Resolve ``x`` (e.g. 'cdao:has_Parent') via resolve_uri and RDF_NAMESPACES."""
    namespaces = RDF_NAMESPACES
    return resolve_uri(x, namespaces=namespaces)
54
def format_label(x):
    """Return ``x`` with every underscore turned into a space."""
    return ' '.join(x.split('_'))
58
# ---------------------------------------------------------
# Public API

def parse(handle, **kwargs):
    """Iterate over the trees in a CDAO file handle.

    A Parser is created for ``handle`` and every keyword argument is
    forwarded unchanged to its ``parse`` method.

    :returns: generator of Bio.Phylo.CDAO.Tree objects.
    """
    parser = Parser(handle)
    return parser.parse(**kwargs)
69
def write(trees, handle, plain=False, **kwargs):
    """Write trees in CDAO format to the given file handle.

    A Writer is created for ``trees``; ``plain`` and any other keyword
    arguments are forwarded unchanged to its ``write`` method.

    :returns: whatever Writer.write returns (documented intent: the
        number of trees written).
    """
    writer = Writer(trees)
    return writer.write(handle, plain=plain, **kwargs)
77
# ---------------------------------------------------------
# Input

class Parser(object):
    """Parse a CDAO tree given a file handle."""

    def __init__(self, handle=None):
        """Store the handle and reset all parsing state.

        :param handle: file-like object holding the RDF text, or None when
            a graph will be supplied separately.
        """
        self.handle = handle
        self.graph = None
        self.node_info = None
        self.children = {}
        self.rooted = False

    @classmethod
    def from_string(cls, treetext):
        """Return a parser that reads from an in-memory copy of ``treetext``."""
        handle = StringIO(treetext)
        return cls(handle)

    def parse(self, **kwargs):
        """Parse the text stream this object was initialized with.

        Keyword arguments are forwarded to parse_handle_to_graph.

        :returns: generator of CDAO.Tree objects.
        """
        # The generator returned by parse_handle_to_graph is not needed here;
        # the call is made for its side effect of filling self.graph.
        self.parse_handle_to_graph(**kwargs)
        return self.parse_graph()

    def parse_handle_to_graph(self, rooted=False,
                              parse_format='turtle', context=None, **kwargs):
        """Parse self.handle into the RDF graph self.graph.

        :param rooted: value recorded as self.rooted and later passed to
            every CDAO.Tree that is created.
        :param parse_format: format name handed to rdflib's Graph.parse.
        :param context: passed through to parse_graph.
        :param base_uri: optional keyword; when absent a ``file://`` URI is
            derived from ``self.handle.name``.
        :returns: the generator produced by parse_graph for the parsed graph.
        """
        if self.graph is None:
            self.graph = rdflib.Graph()
        graph = self.graph

        for prefix, namespace in RDF_NAMESPACES.items():
            graph.bind(prefix, namespace)

        self.rooted = rooted

        if 'base_uri' in kwargs:
            base_uri = kwargs['base_uri']
        else:
            # Windows style slashes cannot be used in an RDF URI
            base_uri = "file://" + os.path.abspath(
                self.handle.name).replace("\\", "/")

        graph.parse(file=self.handle, publicID=base_uri, format=parse_format)

        return self.parse_graph(graph, context=context)

    def parse_graph(self, graph=None, context=None):
        """Generator that yields CDAO.Tree instances from an RDF model.

        :param graph: iterable of RDF triples; defaults to self.graph.
        :param context: forwarded to get_node_info.
        """
        if graph is None:
            graph = self.graph

        # look up branch lengths/TUs for all nodes
        self.get_node_info(graph, context=context)

        for root_node in self.tree_roots:
            clade = self.parse_children(root_node)

            yield CDAO.Tree(root=clade, rooted=self.rooted)

    def new_clade(self, node):
        """Return a CDAO.Clade object for a given named node.

        Only the attributes collected for ``node`` in self.node_info
        (branch length, label, confidence) are passed to the constructor.
        """
        result = self.node_info[node]

        kwargs = {}
        if 'branch_length' in result:
            kwargs['branch_length'] = result['branch_length']
        if 'label' in result:
            # same underscore-to-space convention the Writer applies
            kwargs['name'] = format_label(result['label'])
        if 'confidence' in result:
            kwargs['confidence'] = result['confidence']

        return CDAO.Clade(**kwargs)

    def get_node_info(self, graph, context=None):
        """Create a dictionary containing information about all nodes in the tree.

        Fills self.obj_info (raw predicate values per subject), self.nodes,
        self.tree_roots, self.node_info (data needed to build each clade)
        and self.children (parent -> list of child nodes).

        :param graph: iterable of (subject, predicate, object) triples.
        :param context: accepted for interface compatibility; not used here.
        """
        self.node_info = {}
        self.obj_info = {}
        self.children = {}
        self.nodes = set()
        self.tree_roots = set()

        assignments = {
            qUri('cdao:has_Parent'): 'parent',
            qUri('cdao:belongs_to_Edge_as_Child'): 'edge',
            qUri('cdao:has_Annotation'): 'annotation',
            qUri('cdao:has_Value'): 'value',
            qUri('cdao:represents_TU'): 'tu',
            qUri('rdfs:label'): 'label',
            qUri('cdao:has_Support_Value'): 'confidence',
        }

        # resolve the URIs compared against every triple only once
        rdf_type = qUri('rdf:type')
        has_root = qUri('cdao:has_Root')
        node_types = (qUri('cdao:AncestralNode'), qUri('cdao:TerminalNode'))

        for s, v, o in graph:
            # process each RDF triple in the graph sequentially
            s, v, o = str(s), str(v), str(o)

            this = self.obj_info.setdefault(s, {})

            if v in assignments:
                # the predicate is one we care about; store it for later
                this[assignments[v]] = o

            if v == rdf_type and o in node_types:
                # this is a tree node; store it in set of all nodes
                self.nodes.add(s)
            if v == has_root:
                # this is a tree; store its root in set of all tree roots
                self.tree_roots.add(o)

        for node in self.nodes:
            # for each node, look up all information needed to create a
            # CDAO.Clade
            node_info = self.node_info[node] = {}

            obj = self.obj_info[node]
            if 'edge' in obj:
                # if this object points to an edge, we need a branch length
                # from the annotation on that edge; a referenced resource
                # that never occurs as a subject simply has no data
                edge = self.obj_info.get(obj['edge'], {})
                if 'annotation' in edge:
                    annotation = self.obj_info.get(edge['annotation'], {})
                    if 'value' in annotation:
                        node_info['branch_length'] = float(annotation['value'])

            if 'tu' in obj:
                # if this object points to a TU, we need the label of that TU
                tu = self.obj_info.get(obj['tu'], {})
                if 'label' in tu:
                    node_info['label'] = tu['label']

            if 'confidence' in obj:
                # carry the support value over so new_clade can use it
                try:
                    node_info['confidence'] = float(obj['confidence'])
                except ValueError:
                    # deliberately skip a non-numeric support value rather
                    # than abort the whole parse
                    pass

            if 'parent' in obj:
                # store this node as a child of its parent, if it has one,
                # so that the tree can be traversed from parent to children
                self.children.setdefault(obj['parent'], []).append(node)

    def parse_children(self, node):
        """Traverse the tree to create a nested clade structure.

        Return a CDAO.Clade, and calls itself recursively for each child,
        traversing the entire tree and creating a nested structure of
        CDAO.Clade objects.
        """
        clade = self.new_clade(node)

        children = self.children.get(node, [])
        clade.clades = [
            self.parse_children(child_node) for child_node in children]

        return clade
244
# ---------------------------------------------------------
# Output

class Writer(object):
    """Based on the writer in Bio.Nexus.Trees (str, to_string)."""

    # prefix -> namespace URI table used for the @prefix header and for
    # abbreviating URIs in the emitted statements
    prefixes = RDF_NAMESPACES

    def __init__(self, trees):
        """Store the trees to write and reset the id counters."""
        self.trees = trees

        self.node_counter = 0
        self.edge_counter = 0
        self.tu_counter = 0
        self.tree_counter = 0

        # defaults so process_clade is usable before write() sets them
        self.rooted = False
        self.record_complete_ancestry = False

    def write(self, handle, tree_uri='', record_complete_ancestry=False,
              rooted=False, **kwargs):
        """Write this instance's trees to a file handle.

        :param handle: object with a ``write`` method receiving the text.
        :param tree_uri: optional base URI; a trailing '/' is appended when
            missing and an ``@base`` directive is emitted.
        :param record_complete_ancestry: when true, a cdao:has_Ancestor
            statement is written for every ancestor of every node.
        :param rooted: selects cdao:RootedTree over cdao:UnrootedTree.
        :param kwargs: accepted and ignored (e.g. ``plain``).
        :returns: number of trees written by this call.
        """
        self.rooted = rooted
        self.record_complete_ancestry = record_complete_ancestry

        if tree_uri and not tree_uri.endswith('/'):
            tree_uri += '/'

        if tree_uri:
            # Turtle directives must be terminated with '.'
            handle.write('@base <%s> .\n' % tree_uri)
        for k, v in self.prefixes.items():
            handle.write('@prefix %s: <%s> .\n' % (k, v))

        handle.write('<%s> a owl:Ontology .\n' % self.prefixes['cdao'])

        written = 0
        for tree in self.trees:
            self.tree_counter += 1
            # NOTE(review): this template is never interpolated and is not
            # read anywhere in this module; kept unchanged for compatibility.
            self.tree_uri = 'tree%s'

            for stmt in self.process_clade(tree.clade, root=tree):
                self.add_stmt_to_handle(handle, stmt)
            written += 1

        return written

    def add_stmt_to_handle(self, handle, stmt):
        """Serialize one (subject, predicate, object) statement to ``handle``.

        URIRef parts are abbreviated with the known prefixes (rdf:type
        becomes ``a``), Literal parts use their N3 form and anything else
        is written with ``str``.
        """
        # apply URI prefixes
        stmt_strings = []
        for part in stmt:
            if isinstance(part, rdflib.URIRef):
                node_uri = str(part)
                changed = False
                for prefix, uri in self.prefixes.items():
                    if node_uri.startswith(uri):
                        node_uri = node_uri.replace(uri, '%s:' % prefix, 1)
                        if node_uri == 'rdf:type':
                            node_uri = 'a'
                        changed = True
                        # an abbreviated name cannot match another namespace
                        break
                # NOTE(review): any URI containing ':' is written without
                # angle brackets, including absolute URIs outside the known
                # namespaces - confirm this is intended.
                if changed or ':' in node_uri:
                    stmt_strings.append(node_uri)
                else:
                    stmt_strings.append('<%s>' % node_uri)

            elif isinstance(part, rdflib.Literal):
                stmt_strings.append(part.n3())

            else:
                stmt_strings.append(str(part))

        handle.write('%s .\n' % ' '.join(stmt_strings))

    def process_clade(self, clade, parent=None, root=False):
        """Recursively generate triples describing a tree of clades.

        Sets ``clade.uri`` and ``clade.ancestors`` as a side effect.

        :param clade: the clade to describe.
        :param parent: the already-processed parent clade, or None for the
            first clade of a tree.
        :param root: the tree object when ``clade`` is its first clade,
            otherwise False.
        :returns: generator of (subject, predicate, object) tuples.
        """
        self.node_counter += 1
        clade.uri = 'node%s' % str(self.node_counter).zfill(ZEROES)
        if parent is not None:
            clade.ancestors = parent.ancestors + [parent.uri]
        else:
            clade.ancestors = []

        def nUri(s):
            return rdflib.URIRef(s)

        def pUri(s):
            return rdflib.URIRef(qUri(s))

        xsd_decimal = rdflib.URIRef('http://www.w3.org/2001/XMLSchema#decimal')
        tree_id = nUri('')
        node_ref = nUri(clade.uri)

        statements = []

        if root is not False:
            # create a cdao:RootedTree with reference to the tree root
            tree_type = pUri('cdao:RootedTree') if self.rooted else pUri(
                'cdao:UnrootedTree')

            statements += [
                (tree_id, pUri('rdf:type'), tree_type),
                (tree_id, pUri('cdao:has_Root'), node_ref),
            ]

            for predicate, obj in getattr(root, 'attributes', []):
                statements.append((tree_id, predicate, obj))

        if clade.name:
            # create TU
            self.tu_counter += 1
            tu_uri = 'tu%s' % str(self.tu_counter).zfill(ZEROES)

            statements += [
                (nUri(tu_uri), pUri('rdf:type'), pUri('cdao:TU')),
                (node_ref, pUri('cdao:represents_TU'), nUri(tu_uri)),
                (nUri(tu_uri), pUri('rdfs:label'),
                 rdflib.Literal(format_label(clade.name))),
            ]

            for predicate, obj in getattr(clade, 'tu_attributes', []):
                yield (nUri(tu_uri), predicate, obj)

        # create this node
        node_type = ('cdao:TerminalNode' if clade.is_terminal()
                     else 'cdao:AncestralNode')
        statements += [
            (node_ref, pUri('rdf:type'), pUri(node_type)),
            (node_ref, pUri('cdao:belongs_to_Tree'), tree_id),
        ]

        # stays None for the first clade of a tree, which has no incoming edge
        edge_uri = None
        if parent is not None:
            # create edge from the parent node to this node
            self.edge_counter += 1
            edge_uri = 'edge%s' % str(self.edge_counter).zfill(ZEROES)
            edge_ref = nUri(edge_uri)
            parent_ref = nUri(parent.uri)

            statements += [
                (edge_ref, pUri('rdf:type'), pUri('cdao:DirectedEdge')),
                (edge_ref, pUri('cdao:belongs_to_Tree'), tree_id),
                (edge_ref, pUri('cdao:has_Parent_Node'), parent_ref),
                (edge_ref, pUri('cdao:has_Child_Node'), node_ref),
                (node_ref, pUri('cdao:belongs_to_Edge_as_Child'), edge_ref),
                (node_ref, pUri('cdao:has_Parent'), parent_ref),
                (parent_ref, pUri('cdao:belongs_to_Edge_as_Parent'), edge_ref),
            ]

        if getattr(clade, 'confidence', None) is not None:
            confidence = rdflib.Literal(clade.confidence, datatype=xsd_decimal)

            statements += [(node_ref,
                            pUri('cdao:has_Support_Value'), confidence)]

        if self.record_complete_ancestry and clade.ancestors:
            statements += [(node_ref, pUri('cdao:has_Ancestor'), nUri(ancestor))
                           for ancestor in clade.ancestors]

        if edge_uri is not None:
            # Branch length and edge attributes describe the edge leading to
            # this clade, so they can only be written when such an edge
            # exists (otherwise edge_uri would be unbound).
            if clade.branch_length is not None:
                # add branch length
                edge_ann_uri = 'edge_annotation%s' % str(
                    self.edge_counter).zfill(ZEROES)

                branch_length = rdflib.Literal(
                    clade.branch_length, datatype=xsd_decimal)
                statements += [
                    (nUri(edge_ann_uri), pUri('rdf:type'),
                     pUri('cdao:EdgeLength')),
                    (nUri(edge_uri), pUri('cdao:has_Annotation'),
                     nUri(edge_ann_uri)),
                    (nUri(edge_ann_uri),
                     pUri('cdao:has_Value'), branch_length),
                ]

            for predicate, obj in getattr(clade, 'edge_attributes', []):
                yield (nUri(edge_uri), predicate, obj)

        for stmt in statements:
            yield stmt

        for predicate, obj in getattr(clade, 'attributes', []):
            yield (node_ref, predicate, obj)

        if not clade.is_terminal():
            for new_clade in clade.clades:
                for stmt in self.process_clade(new_clade,
                                               parent=clade, root=False):
                    yield stmt
450