Package Bio :: Package Phylo :: Module CDAOIO
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.CDAOIO

  1  # Copyright (C) 2013 by Ben Morris (ben@bendmorris.com) 
  2  # Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox 
  3  # and Bio.Phylo.Newick, copyright 2009 by Eric Talevich. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license. Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """I/O function wrappers for the RDF/CDAO file format. 
 10   
 11  This is an RDF format that conforms to the Comparative Data Analysis Ontology (CDAO). 
 12  See: http://www.evolutionaryontology.org/cdao 
 13   
 14  This module requires the RDFLib Python package, v3.2.1 or later (https://github.com/RDFLib/rdflib) 
 15   
 16  The CDAOIO.Parser, in addition to parsing text files, can also parse directly 
 17  from a triple store that implements the Redland storage interface; similarly, 
 18  the CDAOIO.Writer can store triples in a triple store instead of serializing 
 19  them to a file. 
 20  """ 
 21   
 22  __docformat__ = "restructuredtext en" 
 23   
 24  from Bio._py3k import StringIO 
 25   
 26  from Bio.Phylo import CDAO 
 27  from ._cdao_owl import cdao_elements, cdao_namespaces, resolve_uri 
 28  import os 
class CDAOError(Exception):
    """Signal that construction of a CDAO object cannot proceed.

    The module also raises this at import time when RDFlib is missing or
    too old.
    """
# RDFlib is a hard requirement of this module: fail at import time with a
# CDAOError if it is missing or older than v3.2.1.
try:
    import rdflib
except ImportError:
    raise CDAOError('Support for CDAO tree format requires RDFlib.')

rdfver = rdflib.__version__
# Compare the major version numerically. Looking only at the first character
# of the version string would misclassify a release such as "10.0.0" as
# version 1 (and let "0.x" through). A non-numeric major component is left
# alone, as before.
_rdf_major = rdfver.split('.', 1)[0]
if (_rdf_major.isdigit() and int(_rdf_major) < 3) or rdfver in (
        '3.0.0', '3.1.0', '3.2.0'):
    raise CDAOError(
        'Support for CDAO tree format requires RDFlib v3.2.1 or later.')

# Prefix -> namespace URI table used both for resolving prefixed names
# (see qUri) and for the prefix declarations emitted by the Writer.
RDF_NAMESPACES = {
    'owl': 'http://www.w3.org/2002/07/owl#',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
}
RDF_NAMESPACES.update(cdao_namespaces)
# pad node ids with zeroes until they're at least this length
ZEROES = 8
def qUri(x):
    """Resolve ``x`` through ``resolve_uri`` using the module namespaces.

    Callers in this module pass prefixed names such as ``'rdf:type'`` and
    compare the result against stringified predicate/object URIs.
    """
    return resolve_uri(x, namespaces=RDF_NAMESPACES)
56
def format_label(x):
    """Return ``x`` with every underscore turned into a space."""
    pieces = x.split('_')
    return ' '.join(pieces)
60
# ---------------------------------------------------------
# Public API

def parse(handle, **kwargs):
    """Iterate over the trees found in a CDAO file handle.

    A Parser is built around ``handle`` and all keyword arguments are
    forwarded to its ``parse`` method.

    :returns: generator of Bio.Phylo.CDAO.Tree objects.
    """
    parser = Parser(handle)
    return parser.parse(**kwargs)
71
def write(trees, handle, plain=False, **kwargs):
    """Write trees in CDAO format to the given file handle.

    ``plain`` and any extra keyword arguments are forwarded unchanged to
    ``Writer.write``.

    :returns: whatever ``Writer.write`` returns (documented intent: the
        number of trees written).
    """
    writer = Writer(trees)
    return writer.write(handle, plain=plain, **kwargs)
79
# ---------------------------------------------------------
# Input

class Parser(object):
    """Parse a CDAO tree given a file handle."""

    def __init__(self, handle=None):
        """Remember ``handle`` and reset all parsing state."""
        self.handle = handle
        self.graph = None
        self.node_info = None
        self.children = {}
        self.rooted = False

    @classmethod
    def from_string(cls, treetext):
        """Create a parser reading from an in-memory copy of ``treetext``."""
        handle = StringIO(treetext)
        return cls(handle)

    def parse(self, **kwargs):
        """Parse the text stream this object was initialized with.

        Keyword arguments are forwarded to ``parse_handle_to_graph``.

        :returns: generator of CDAO.Tree objects.
        """
        self.parse_handle_to_graph(**kwargs)
        return self.parse_graph()

    def parse_handle_to_graph(self, rooted=False,
                              parse_format='turtle', context=None, **kwargs):
        """Parse self.handle into the RDF graph self.graph.

        :param rooted: stored on the parser and passed to every CDAO.Tree
            that is later created.
        :param parse_format: format name handed to ``graph.parse``.
        :param context: forwarded to ``parse_graph``.
        :param kwargs: only ``base_uri`` is consulted; when absent, a
            ``file://`` URI is derived from ``self.handle.name``, so handles
            without a ``name`` attribute need an explicit ``base_uri``.
        :returns: the generator produced by ``parse_graph``.
        """
        if self.graph is None:
            self.graph = rdflib.Graph()
        graph = self.graph

        for k, v in RDF_NAMESPACES.items():
            graph.bind(k, v)

        self.rooted = rooted

        if 'base_uri' in kwargs:
            base_uri = kwargs['base_uri']
        else:
            base_uri = "file://" + os.path.abspath(self.handle.name)

        graph.parse(file=self.handle, publicID=base_uri, format=parse_format)

        return self.parse_graph(graph, context=context)

    def parse_graph(self, graph=None, context=None):
        """Generator that yields CDAO.Tree instances from an RDF model.

        :param graph: graph to read; defaults to ``self.graph``.
        :param context: forwarded to ``get_node_info``.
        """
        if graph is None:
            graph = self.graph

        # look up branch lengths/TUs for all nodes
        self.get_node_info(graph, context=context)

        for root_node in self.tree_roots:
            clade = self.parse_children(root_node)

            yield CDAO.Tree(root=clade, rooted=self.rooted)

    def new_clade(self, node):
        """Return a CDAO.Clade object for a given named node.

        Only the keys present in ``self.node_info[node]`` are passed to the
        CDAO.Clade constructor.
        """
        result = self.node_info[node]

        kwargs = {}
        if 'branch_length' in result:
            kwargs['branch_length'] = result['branch_length']
        if 'label' in result:
            kwargs['name'] = format_label(result['label'])
        if 'confidence' in result:
            kwargs['confidence'] = result['confidence']

        return CDAO.Clade(**kwargs)

    def get_node_info(self, graph, context=None):
        """Create a dictionary containing information about all nodes in the tree.

        Fills ``self.obj_info`` (raw per-subject facts), ``self.nodes``,
        ``self.tree_roots``, ``self.node_info`` (per-node clade data) and
        ``self.children`` (parent -> list of child nodes).

        :param graph: iterable of (subject, predicate, object) triples.
        :param context: accepted but not used by this method.
        """
        self.node_info = {}
        self.obj_info = {}
        self.children = {}
        self.nodes = set()
        self.tree_roots = set()

        assignments = {
            qUri('cdao:has_Parent'): 'parent',
            qUri('cdao:belongs_to_Edge_as_Child'): 'edge',
            qUri('cdao:has_Annotation'): 'annotation',
            qUri('cdao:has_Value'): 'value',
            qUri('cdao:represents_TU'): 'tu',
            qUri('rdfs:label'): 'label',
            qUri('cdao:has_Support_Value'): 'confidence',
        }

        # resolve the URIs compared in the loop once, not once per triple
        rdf_type = qUri('rdf:type')
        has_root = qUri('cdao:has_Root')
        node_types = (qUri('cdao:AncestralNode'), qUri('cdao:TerminalNode'))

        for s, v, o in graph:
            # process each RDF triple in the graph sequentially
            s, v, o = str(s), str(v), str(o)

            if s not in self.obj_info:
                self.obj_info[s] = {}
            this = self.obj_info[s]

            # if the predicate is one we care about, store information for
            # later
            key = assignments.get(v)
            if key is not None:
                this[key] = o

            if v == rdf_type:
                if o in node_types:
                    # this is a tree node; store it in set of all nodes
                    self.nodes.add(s)
            if v == has_root:
                # this is a tree; store its root in set of all tree roots
                self.tree_roots.add(o)

        for node in self.nodes:
            # for each node, look up all information needed to create a
            # CDAO.Clade
            self.node_info[node] = {}
            node_info = self.node_info[node]

            obj = self.obj_info[node]
            if 'edge' in obj:
                # if this object points to an edge, we need a branch length
                # from the annotation on that edge
                edge = self.obj_info[obj['edge']]
                if 'annotation' in edge:
                    annotation = self.obj_info[edge['annotation']]
                    if 'value' in annotation:
                        node_info['branch_length'] = float(annotation['value'])

            if 'tu' in obj:
                # if this object points to a TU, we need the label of that TU
                tu = self.obj_info[obj['tu']]
                if 'label' in tu:
                    node_info['label'] = tu['label']

            if 'confidence' in obj:
                # carry the support value over so new_clade can see it;
                # without this the value collected above is silently dropped
                node_info['confidence'] = float(obj['confidence'])

            if 'parent' in obj:
                # store this node as a child of its parent, if it has one,
                # so that the tree can be traversed from parent to children
                parent = obj['parent']
                if parent not in self.children:
                    self.children[parent] = []
                self.children[parent].append(node)

    def parse_children(self, node):
        """Traverse the tree to create a nested clade structure.

        Return a CDAO.Clade, and calls itself recursively for each child,
        traversing the entire tree and creating a nested structure of
        CDAO.Clade objects.
        """
        clade = self.new_clade(node)

        children = self.children[node] if node in self.children else []
        clade.clades = [
            self.parse_children(child_node) for child_node in children]

        return clade
245
# ---------------------------------------------------------
# Output

class Writer(object):
    """Based on the writer in Bio.Nexus.Trees (str, to_string)."""

    prefixes = RDF_NAMESPACES

    def __init__(self, trees):
        """Store the trees to write and zero the URI counters."""
        self.trees = trees

        self.node_counter = 0
        self.edge_counter = 0
        self.tu_counter = 0
        self.tree_counter = 0

    def write(self, handle, tree_uri='', record_complete_ancestry=False,
              rooted=False, **kwargs):
        """Write this instance's trees to a file handle.

        :param handle: object with a ``write`` method receiving text.
        :param tree_uri: when non-empty, emitted as the ``@base`` URI (a
            trailing ``/`` is appended if missing).
        :param record_complete_ancestry: when true, a ``cdao:has_Ancestor``
            statement is emitted for every ancestor of every clade.
        :param rooted: selects ``cdao:RootedTree`` over
            ``cdao:UnrootedTree`` as the type of each tree.
        :param kwargs: accepted for call compatibility; not used here.
        :returns: number of trees written by this call.
        """
        self.rooted = rooted
        self.record_complete_ancestry = record_complete_ancestry

        if tree_uri and not tree_uri.endswith('/'):
            tree_uri += '/'

        if tree_uri:
            handle.write('@base <%s>\n' % tree_uri)
        for k, v in self.prefixes.items():
            handle.write('@prefix %s: <%s> .\n' % (k, v))

        handle.write('<%s> a owl:Ontology .\n' % self.prefixes['cdao'])

        written = 0
        for tree in self.trees:
            self.tree_counter += 1
            # NOTE(review): the template is stored uninterpolated and is not
            # read anywhere in this class; kept as-is.
            self.tree_uri = 'tree%s'

            for stmt in self.process_clade(tree.clade, root=tree):
                self.add_stmt_to_handle(handle, stmt)
            written += 1

        return written

    def add_stmt_to_handle(self, handle, stmt):
        """Write one statement tuple to ``handle`` as a ``' .'``-terminated line.

        URIRef parts are abbreviated with the known prefixes (``rdf:type``
        becomes ``a``), Literal parts are rendered with ``n3()``, and
        anything else is passed through ``str``.
        """
        # apply URI prefixes
        stmt_strings = []
        for part in stmt:
            if isinstance(part, rdflib.URIRef):
                node_uri = str(part)
                changed = False
                for prefix, uri in self.prefixes.items():
                    if node_uri.startswith(uri):
                        node_uri = node_uri.replace(uri, '%s:' % prefix, 1)
                        if node_uri == 'rdf:type':
                            node_uri = 'a'
                        changed = True
                # NOTE(review): any URI containing ':' is written without
                # angle brackets, including absolute URIs that match no
                # known prefix - confirm this is intended.
                if changed or ':' in node_uri:
                    stmt_strings.append(node_uri)
                else:
                    stmt_strings.append('<%s>' % node_uri)

            elif isinstance(part, rdflib.Literal):
                stmt_strings.append(part.n3())

            else:
                stmt_strings.append(str(part))

        handle.write('%s .\n' % ' '.join(stmt_strings))

    def process_clade(self, clade, parent=None, root=False):
        """Recursively generate triples describing a tree of clades.

        Sets ``clade.uri`` and ``clade.ancestors`` as a side effect and
        advances the node/edge/TU counters.

        :param clade: clade to describe; its children are visited
            recursively.
        :param parent: the already-processed parent clade, or None for the
            first clade of a tree.
        :param root: the tree object when ``clade`` is the tree's first
            clade, otherwise False.
        :returns: generator of (subject, predicate, object) tuples.
        """
        self.node_counter += 1
        clade.uri = 'node%s' % str(self.node_counter).zfill(ZEROES)
        if parent is not None:
            clade.ancestors = parent.ancestors + [parent.uri]
        else:
            clade.ancestors = []

        nUri = rdflib.URIRef

        def pUri(s):
            # resolve a prefixed name before wrapping it as a URIRef
            return rdflib.URIRef(qUri(s))

        # the empty relative URI is used as the subject for the tree itself
        tree_id = nUri('')
        xsd_decimal = rdflib.URIRef('http://www.w3.org/2001/XMLSchema#decimal')

        statements = []

        if root is not False:
            # create a cdao:RootedTree with reference to the tree root
            tree_type = pUri('cdao:RootedTree') if self.rooted else pUri(
                'cdao:UnrootedTree')

            statements += [
                (tree_id, pUri('rdf:type'), tree_type),
                (tree_id, pUri('cdao:has_Root'), nUri(clade.uri)),
            ]

            for predicate, obj in getattr(root, 'attributes', []):
                statements.append((tree_id, predicate, obj))

        if clade.name:
            # create TU
            self.tu_counter += 1
            tu_uri = 'tu%s' % str(self.tu_counter).zfill(ZEROES)

            statements += [
                (nUri(tu_uri), pUri('rdf:type'), pUri('cdao:TU')),
                (nUri(clade.uri), pUri('cdao:represents_TU'), nUri(tu_uri)),
                (nUri(tu_uri), pUri('rdfs:label'),
                 rdflib.Literal(format_label(clade.name))),
            ]

            # extra TU attributes are yielded immediately, ahead of the
            # collected statements
            for predicate, obj in getattr(clade, 'tu_attributes', []):
                yield (nUri(tu_uri), predicate, obj)

        # create this node
        node_type = ('cdao:TerminalNode' if clade.is_terminal()
                     else 'cdao:AncestralNode')
        statements += [
            (nUri(clade.uri), pUri('rdf:type'), pUri(node_type)),
            (nUri(clade.uri), pUri('cdao:belongs_to_Tree'), tree_id),
        ]

        if parent is not None:
            # create edge from the parent node to this node
            self.edge_counter += 1
            edge_uri = 'edge%s' % str(self.edge_counter).zfill(ZEROES)

            statements += [
                (nUri(edge_uri), pUri('rdf:type'), pUri('cdao:DirectedEdge')),
                (nUri(edge_uri), pUri('cdao:belongs_to_Tree'), tree_id),
                (nUri(edge_uri), pUri('cdao:has_Parent_Node'),
                 nUri(parent.uri)),
                (nUri(edge_uri), pUri('cdao:has_Child_Node'),
                 nUri(clade.uri)),
                (nUri(clade.uri), pUri('cdao:belongs_to_Edge_as_Child'),
                 nUri(edge_uri)),
                (nUri(clade.uri), pUri('cdao:has_Parent'),
                 nUri(parent.uri)),
                (nUri(parent.uri), pUri('cdao:belongs_to_Edge_as_Parent'),
                 nUri(edge_uri)),
            ]

            if getattr(clade, 'confidence', None) is not None:
                confidence = rdflib.Literal(
                    clade.confidence, datatype=xsd_decimal)

                statements += [(nUri(clade.uri),
                                pUri('cdao:has_Support_Value'), confidence)]

            if self.record_complete_ancestry and len(clade.ancestors) > 0:
                statements += [
                    (nUri(clade.uri), pUri('cdao:has_Ancestor'),
                     nUri(ancestor))
                    for ancestor in clade.ancestors]

            if clade.branch_length is not None:
                # add branch length; the annotation hangs off the edge
                # created above, so it only exists for non-root clades
                edge_ann_uri = 'edge_annotation%s' % str(
                    self.edge_counter).zfill(ZEROES)

                branch_length = rdflib.Literal(
                    clade.branch_length, datatype=xsd_decimal)
                statements += [
                    (nUri(edge_ann_uri), pUri('rdf:type'),
                     pUri('cdao:EdgeLength')),
                    (nUri(edge_uri), pUri('cdao:has_Annotation'),
                     nUri(edge_ann_uri)),
                    (nUri(edge_ann_uri),
                     pUri('cdao:has_Value'), branch_length),
                ]

                for predicate, obj in getattr(clade, 'edge_attributes', []):
                    yield (nUri(edge_uri), predicate, obj)

        for stmt in statements:
            yield stmt

        for predicate, obj in getattr(clade, 'attributes', []):
            yield (nUri(clade.uri), predicate, obj)

        if not clade.is_terminal():
            for new_clade in clade.clades:
                for stmt in self.process_clade(new_clade, parent=clade,
                                               root=False):
                    yield stmt
451