Package Bio :: Package Phylo :: Module CDAOIO
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.CDAOIO

  1  # Copyright (C) 2013 by Ben Morris (ben@bendmorris.com) 
  2  # Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox 
  3  # and Bio.Phylo.Newick, copyright 2009 by Eric Talevich. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license. Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """I/O function wrappers for the RDF/CDAO file format. 
 10   
 11  This is an RDF format that conforms to the Comparative Data Analysis Ontology (CDAO). 
 12  See: http://www.evolutionaryontology.org/cdao 
 13   
 14  This module requires the RDFLib Python package (rdflib), version 3.2.1 or later. 
 15   
 16  The CDAOIO.Parser, in addition to parsing text files, can also parse directly 
 17  from a triple store that implements the Redland storage interface; similarly, 
 18  the CDAOIO.Writer can store triples in a triple store instead of serializing 
 19  them to a file. 
 20  """ 
 21   
 22  from Bio._py3k import StringIO 
 23   
 24  from Bio.Phylo import CDAO 
 25  from ._cdao_owl import cdao_elements, cdao_namespaces, resolve_uri 
 26  import os 
class CDAOError(Exception):
    """Signal that building or reading a CDAO object cannot proceed.

    Raised at import time by this module when RDFlib is missing or too old.
    """
def _version_tuple(version):
    """Return the leading numeric fields of a dotted version string.

    Only the first three dot-separated fields are considered, and each field
    contributes its leading run of digits, so "3.2.1-dev" gives (3, 2, 1).
    A field with no leading digits counts as 0.
    """
    fields = []
    for field in version.split('.')[:3]:
        digits = ''
        for char in field:
            if not char.isdigit():
                break
            digits += char
        fields.append(int(digits) if digits else 0)
    return tuple(fields)


try:
    import rdflib
    rdfver = rdflib.__version__
    # Compare numerically: testing only the first character of the version
    # string would misread a future "10.x" release as version 1, and an
    # explicit list of bad releases misses e.g. "3.0.1".
    if _version_tuple(rdfver) < (3, 2, 1):
        raise CDAOError(
            'Support for CDAO tree format requires RDFlib v3.2.1 or later.')
except ImportError:
    raise CDAOError('Support for CDAO tree format requires RDFlib.')

# prefix -> namespace URI table used both to expand prefixed names (qUri)
# and to emit @prefix declarations (Writer)
RDF_NAMESPACES = {
    'owl': 'http://www.w3.org/2002/07/owl#',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
}
RDF_NAMESPACES.update(cdao_namespaces)
# pad node ids with zeroes until they're at least this length
ZEROES = 8
def qUri(x):
    """Resolve *x* through ``resolve_uri`` using this module's namespaces.

    Call sites in this module pass prefixed names such as ``'rdf:type'`` or
    ``'cdao:has_Root'``; the lookup table is ``RDF_NAMESPACES``.
    """
    resolved = resolve_uri(x, namespaces=RDF_NAMESPACES)
    return resolved
54
def format_label(x):
    """Return the label *x* with every underscore turned into a space."""
    pieces = x.split('_')
    return ' '.join(pieces)
58
# ---------------------------------------------------------
# Public API

def parse(handle, **kwargs):
    """Iterate over the trees found in a CDAO file handle.

    All keyword arguments are forwarded unchanged to ``Parser.parse``.

    :returns: generator of Bio.Phylo.CDAO.Tree objects.
    """
    parser = Parser(handle)
    return parser.parse(**kwargs)
69
def write(trees, handle, plain=False, **kwargs):
    """Write trees in CDAO format to the given file handle.

    ``plain`` and any further keyword arguments are forwarded unchanged to
    ``Writer.write``.

    :returns: the result of ``Writer.write`` (intended to be the number of
        trees written).
    """
    writer = Writer(trees)
    return writer.write(handle, plain=plain, **kwargs)
77
# ---------------------------------------------------------
# Input

class Parser(object):
    """Parse a CDAO tree given a file handle.

    The handle is read into an rdflib graph, the triples are indexed by
    subject, and one CDAO.Tree is produced per ``cdao:has_Root`` statement.
    """

    def __init__(self, handle=None):
        """Store the handle and reset all parsing state."""
        self.handle = handle
        self.graph = None
        self.node_info = None
        self.children = {}
        self.rooted = False

    @classmethod
    def from_string(cls, treetext):
        """Build a Parser that reads from the text *treetext*."""
        # NOTE(review): parse_handle_to_graph() reads self.handle.name when
        # no base_uri is supplied; confirm that a StringIO handle works there.
        handle = StringIO(treetext)
        return cls(handle)

    def parse(self, **kwargs):
        """Parse the text stream this object was initialized with.

        Keyword arguments are forwarded to ``parse_handle_to_graph``.

        :returns: generator of CDAO.Tree objects.
        """
        self.parse_handle_to_graph(**kwargs)
        return self.parse_graph()

    def parse_handle_to_graph(self, rooted=False,
                              parse_format='turtle', context=None, **kwargs):
        """Parse self.handle into the RDF graph self.graph.

        :param rooted: stored on the parser and passed to every CDAO.Tree.
        :param parse_format: format name handed to ``rdflib.Graph.parse``.
        :param context: forwarded to ``parse_graph``.
        :param base_uri: optional keyword; used as the ``publicID`` of the
            parsed document. Defaults to a ``file://`` URI built from the
            handle's ``name`` attribute.
        :returns: generator of CDAO.Tree objects for the parsed graph.
        """
        if self.graph is None:
            self.graph = rdflib.Graph()
        graph = self.graph

        for k, v in RDF_NAMESPACES.items():
            graph.bind(k, v)

        self.rooted = rooted

        if 'base_uri' in kwargs:
            base_uri = kwargs['base_uri']
        else:
            # Windows style slashes cannot be used in an RDF URI
            base_uri = "file://" + os.path.abspath(
                self.handle.name).replace("\\", "/")

        graph.parse(file=self.handle, publicID=base_uri, format=parse_format)

        return self.parse_graph(graph, context=context)

    def parse_graph(self, graph=None, context=None):
        """Generator that yields CDAO.Tree instances from an RDF model.

        :param graph: iterable of RDF triples; defaults to self.graph.
        :param context: forwarded to ``get_node_info``.
        """
        if graph is None:
            graph = self.graph

        # look up branch lengths/TUs for all nodes
        self.get_node_info(graph, context=context)

        for root_node in self.tree_roots:
            clade = self.parse_children(root_node)

            yield CDAO.Tree(root=clade, rooted=self.rooted)

    def new_clade(self, node):
        """Return a CDAO.Clade object for a given named node.

        Only the ``branch_length``, ``label`` and ``confidence`` entries of
        ``self.node_info[node]`` are used; the label becomes the clade name
        with underscores replaced by spaces.
        """
        result = self.node_info[node]

        kwargs = {}
        if 'branch_length' in result:
            kwargs['branch_length'] = result['branch_length']
        if 'label' in result:
            kwargs['name'] = format_label(result['label'])
        if 'confidence' in result:
            kwargs['confidence'] = result['confidence']

        clade = CDAO.Clade(**kwargs)

        return clade

    def get_node_info(self, graph, context=None):
        """Create a dictionary containing information about all tree nodes.

        Fills ``self.obj_info`` (per-subject predicate values),
        ``self.nodes``, ``self.tree_roots``, ``self.node_info`` and
        ``self.children``.

        :param graph: iterable of (subject, predicate, object) triples.
        :param context: accepted but not used by this method.
        """
        self.node_info = {}
        self.obj_info = {}
        self.children = {}
        self.nodes = set()
        self.tree_roots = set()

        assignments = {
            qUri('cdao:has_Parent'): 'parent',
            qUri('cdao:belongs_to_Edge_as_Child'): 'edge',
            qUri('cdao:has_Annotation'): 'annotation',
            qUri('cdao:has_Value'): 'value',
            qUri('cdao:represents_TU'): 'tu',
            qUri('rdfs:label'): 'label',
            qUri('cdao:has_Support_Value'): 'confidence',
        }

        # resolve the fixed URIs once instead of once per triple
        rdf_type = qUri('rdf:type')
        has_root = qUri('cdao:has_Root')
        node_types = (qUri('cdao:AncestralNode'), qUri('cdao:TerminalNode'))

        for s, v, o in graph:
            # process each RDF triple in the graph sequentially

            s, v, o = str(s), str(v), str(o)

            if s not in self.obj_info:
                self.obj_info[s] = {}
            this = self.obj_info[s]

            if v in assignments:
                # if the predicate is one we care about, store information
                # for later
                this[assignments[v]] = o

            if v == rdf_type:
                if o in node_types:
                    # this is a tree node; store it in set of all nodes
                    self.nodes.add(s)
            if v == has_root:
                # this is a tree; store its root in set of all tree roots
                self.tree_roots.add(o)

        for node in self.nodes:
            # for each node, look up all information needed to create a
            # CDAO.Clade
            self.node_info[node] = {}
            node_info = self.node_info[node]

            obj = self.obj_info[node]
            if 'edge' in obj:
                # if this object points to an edge, we need a branch length
                # from the annotation on that edge
                edge = self.obj_info[obj['edge']]
                if 'annotation' in edge:
                    annotation = self.obj_info[edge['annotation']]
                    if 'value' in annotation:
                        node_info['branch_length'] = float(annotation['value'])

            if 'tu' in obj:
                # if this object points to a TU, we need the label of that TU
                tu = self.obj_info[obj['tu']]
                if 'label' in tu:
                    node_info['label'] = tu['label']

            if 'confidence' in obj:
                # carry the support value through so new_clade() can pass it
                # on; a non-numeric value is left out rather than aborting
                # the whole parse
                try:
                    node_info['confidence'] = float(obj['confidence'])
                except ValueError:
                    pass

            if 'parent' in obj:
                # store this node as a child of its parent, if it has one,
                # so that the tree can be traversed from parent to children
                parent = obj['parent']
                if parent not in self.children:
                    self.children[parent] = []
                self.children[parent].append(node)

    def parse_children(self, node):
        """Traverse the tree to create a nested clade structure.

        Return a CDAO.Clade, and calls itself recursively for each child,
        traversing the entire tree and creating a nested structure of
        CDAO.Clade objects.
        """
        clade = self.new_clade(node)

        children = self.children.get(node, [])
        clade.clades = [
            self.parse_children(child_node) for child_node in children]

        return clade
239
# ---------------------------------------------------------
# Output

class Writer(object):
    """Serialize trees as CDAO triples in Turtle syntax.

    Based on the writer in Bio.Nexus.Trees (str, to_string).
    """

    # prefix -> namespace URI table used for @prefix lines and for
    # abbreviating URIs in emitted statements
    prefixes = RDF_NAMESPACES

    def __init__(self, trees):
        """Store the trees to write and reset the id counters."""
        self.trees = trees

        self.node_counter = 0
        self.edge_counter = 0
        self.tu_counter = 0
        self.tree_counter = 0

    def write(self, handle, tree_uri='', record_complete_ancestry=False,
              rooted=False, **kwargs):
        """Write this instance's trees to a file handle.

        :param handle: object with a ``write`` method receiving the text.
        :param tree_uri: optional base URI; a trailing '/' is appended when
            missing and an ``@base`` directive is emitted.
        :param record_complete_ancestry: when true, emit a
            ``cdao:has_Ancestor`` statement for every ancestor of each node.
        :param rooted: selects ``cdao:RootedTree`` over ``cdao:UnrootedTree``.
        :param kwargs: accepted for interface compatibility and ignored.
        :returns: number of trees written.
        """
        self.rooted = rooted
        self.record_complete_ancestry = record_complete_ancestry

        if tree_uri and not tree_uri.endswith('/'):
            tree_uri += '/'

        trees = self.trees

        if tree_uri:
            # a Turtle @base directive must be terminated by '.'
            handle.write('@base <%s> .\n' % tree_uri)
        for k, v in self.prefixes.items():
            handle.write('@prefix %s: <%s> .\n' % (k, v))

        handle.write('<%s> a owl:Ontology .\n' % self.prefixes['cdao'])

        written = 0
        for tree in trees:
            self.tree_counter += 1
            written += 1
            # NOTE(review): the template is stored unformatted and is not
            # read anywhere in this class; kept as-is.
            self.tree_uri = 'tree%s'

            first_clade = tree.clade
            for stmt in self.process_clade(first_clade, root=tree):
                self.add_stmt_to_handle(handle, stmt)

        return written

    def add_stmt_to_handle(self, handle, stmt):
        """Write one statement (an iterable of terms) as a Turtle line.

        URIRef terms are abbreviated with a known prefix where possible
        (``rdf:type`` becomes ``a``), Literal terms are written in N3 form,
        and anything else is written via ``str``.
        """
        # apply URI prefixes
        stmt_strings = []
        for part in stmt:
            if isinstance(part, rdflib.URIRef):
                node_uri = str(part)
                changed = False
                for prefix, uri in self.prefixes.items():
                    if node_uri.startswith(uri):
                        node_uri = node_uri.replace(uri, '%s:' % prefix, 1)
                        if node_uri == 'rdf:type':
                            node_uri = 'a'
                        changed = True
                        # the URI is abbreviated now; no other namespace
                        # can match the shortened form
                        break
                # NOTE(review): any unabbreviated URI containing ':' is also
                # written bare (without <>); confirm this is intended for
                # absolute URIs outside the known namespaces.
                if changed or ':' in node_uri:
                    stmt_strings.append(node_uri)
                else:
                    stmt_strings.append('<%s>' % node_uri)

            elif isinstance(part, rdflib.Literal):
                stmt_strings.append(part.n3())

            else:
                stmt_strings.append(str(part))

        handle.write('%s .\n' % ' '.join(stmt_strings))

    def process_clade(self, clade, parent=None, root=False):
        """Recursively generate triples describing a tree of clades.

        Sets ``clade.uri`` and ``clade.ancestors`` as a side effect and
        advances the node/edge/TU counters.

        :param clade: clade to describe.
        :param parent: the already-processed parent clade, or None for the
            first clade of a tree.
        :param root: the tree object when *clade* is its root, else False.
        :returns: generator of (subject, predicate, object) tuples.
        """
        self.node_counter += 1
        clade.uri = 'node%s' % str(self.node_counter).zfill(ZEROES)
        if parent is not None:
            clade.ancestors = parent.ancestors + [parent.uri]
        else:
            clade.ancestors = []

        nUri = rdflib.URIRef

        def pUri(s):
            return rdflib.URIRef(qUri(s))

        xsd_decimal = rdflib.URIRef(
            'http://www.w3.org/2001/XMLSchema#decimal')
        tree_id = nUri('')
        is_terminal = clade.is_terminal()

        statements = []

        if root is not False:
            # create a cdao:RootedTree with reference to the tree root
            tree_type = pUri('cdao:RootedTree') if self.rooted else pUri(
                'cdao:UnrootedTree')

            statements += [
                (tree_id, pUri('rdf:type'), tree_type),
                (tree_id, pUri('cdao:has_Root'), nUri(clade.uri)),
            ]

            try:
                tree_attributes = root.attributes
            except AttributeError:
                tree_attributes = []

            for predicate, obj in tree_attributes:
                statements.append((tree_id, predicate, obj))

        if clade.name:
            # create TU
            self.tu_counter += 1
            tu_uri = 'tu%s' % str(self.tu_counter).zfill(ZEROES)

            statements += [
                (nUri(tu_uri), pUri('rdf:type'), pUri('cdao:TU')),
                (nUri(clade.uri), pUri('cdao:represents_TU'), nUri(tu_uri)),
                (nUri(tu_uri), pUri('rdfs:label'),
                 rdflib.Literal(format_label(clade.name))),
            ]

            try:
                tu_attributes = clade.tu_attributes
            except AttributeError:
                tu_attributes = []

            for predicate, obj in tu_attributes:
                yield (nUri(tu_uri), predicate, obj)

        # create this node
        node_type = 'cdao:TerminalNode' if is_terminal else 'cdao:AncestralNode'
        statements += [
            (nUri(clade.uri), pUri('rdf:type'), pUri(node_type)),
            (nUri(clade.uri), pUri('cdao:belongs_to_Tree'), tree_id),
        ]

        if parent is not None:
            # create edge from the parent node to this node; everything below
            # that refers to edge_uri must stay inside this branch
            self.edge_counter += 1
            edge_uri = 'edge%s' % str(self.edge_counter).zfill(ZEROES)

            statements += [
                (nUri(edge_uri), pUri('rdf:type'), pUri('cdao:DirectedEdge')),
                (nUri(edge_uri), pUri('cdao:belongs_to_Tree'), tree_id),
                (nUri(edge_uri), pUri('cdao:has_Parent_Node'),
                 nUri(parent.uri)),
                (nUri(edge_uri), pUri('cdao:has_Child_Node'),
                 nUri(clade.uri)),
                (nUri(clade.uri), pUri('cdao:belongs_to_Edge_as_Child'),
                 nUri(edge_uri)),
                (nUri(clade.uri), pUri('cdao:has_Parent'),
                 nUri(parent.uri)),
                (nUri(parent.uri), pUri('cdao:belongs_to_Edge_as_Parent'),
                 nUri(edge_uri)),
            ]

            if getattr(clade, 'confidence', None) is not None:
                confidence = rdflib.Literal(
                    clade.confidence, datatype=xsd_decimal)

                statements += [(nUri(clade.uri),
                                pUri('cdao:has_Support_Value'), confidence)]

            if self.record_complete_ancestry and len(clade.ancestors) > 0:
                statements += [
                    (nUri(clade.uri), pUri('cdao:has_Ancestor'),
                     nUri(ancestor))
                    for ancestor in clade.ancestors]

            if clade.branch_length is not None:
                # add branch length
                edge_ann_uri = 'edge_annotation%s' % str(
                    self.edge_counter).zfill(ZEROES)

                branch_length = rdflib.Literal(
                    clade.branch_length, datatype=xsd_decimal)
                statements += [
                    (nUri(edge_ann_uri), pUri('rdf:type'),
                     pUri('cdao:EdgeLength')),
                    (nUri(edge_uri), pUri('cdao:has_Annotation'),
                     nUri(edge_ann_uri)),
                    (nUri(edge_ann_uri),
                     pUri('cdao:has_Value'), branch_length),
                ]

            try:
                edge_attributes = clade.edge_attributes
            except AttributeError:
                edge_attributes = []

            for predicate, obj in edge_attributes:
                yield (nUri(edge_uri), predicate, obj)

        for stmt in statements:
            yield stmt

        try:
            clade_attributes = clade.attributes
        except AttributeError:
            clade_attributes = []

        for predicate, obj in clade_attributes:
            yield (nUri(clade.uri), predicate, obj)

        if not is_terminal:
            for new_clade in clade.clades:
                for stmt in self.process_clade(new_clade, parent=clade,
                                               root=False):
                    yield stmt
443