# smartie.py: CCP4 logfile parsing classes and functions # Copyright (C) 2006-2007 Peter Briggs, Wanjuan Yang, CCLRC # # This code is distributed under the terms and conditions of the # CCP4 licence agreement as `Part 1' (Annex 2) software. # A copy of the CCP4 licence can be obtained by writing to the # CCP4 Secretary, Daresbury Laboratory, Warrington WA4 4AD, UK. # ######################################################################## # # smartie.py # ######################################################################### """smartie: CCP4 logfile parsing functions The smartie module provides a set of classes and methods for parsing logfiles from CCP4i and CCP4 programs. The central class is the 'logfile', which provides a basic DOM-like description of a logfile and its contents. Other classes provide descriptions of smaller chunks of logfile features (programs, tables, keytext data and CCP4i informational messages). The name 'smartie' reflects the module's origins as the intended driver for a 'smart logfile browser'. Some additional documentation material is also available in the file smartie_overview.html.""" __cvs_id__ = "$Id: smartie.py,v 1.7 2012/07/05 20:30:36 rmk65 Exp $" __version__ = "0.0.15" ####################################################################### # Import modules that this module depends on ####################################################################### import sys import os import re import copy import linecache import time ####################################################################### # Class definitions ####################################################################### # buffer # # A class to store sets of lines which can then # be passed to regular expression matching functions # class buffer: """Buffer object for parsing log files. The buffer object holds lines of text which are added a line at a time via the 'append' method.""" def __init__(self,maxsize=0): """Initialise a new buffer object. If 'maxsize' is greater than zero then it sets the maximum number of lines that the buffer will hold. If this number of lines is exceeded by an append operation, then the 'oldest' line is dropped from the buffer. If 'maxsize' is zero or negative then no upper limit is placed on the number of lines that the buffer will store.""" self.__contents = [] self.__maxsize = maxsize def __len__(self): """Builtin: return the number of lines stored in the buffer.""" return len(self.__contents) def append(self,line): """Append a line of text to the buffer. The line will have any trailing 'end of line' characters removed automatically upon storage.""" if self.__maxsize > 0 and len(self.__contents) > self.__maxsize: self.__contents = self.__contents[1:] # Remove trailing newline/end of line characters self.__contents.append(line.rstrip("\r\n")) def len(self): """Return the number of lines currently stored. Deprecated: use len(table) instead.""" return self.__len__() def line(self,n): """Return the n'th line of text from the buffer. The line will be returned without end-of-line characters.""" return self.__contents[n] def tail(self,n=10): """Return the 'tail' of the buffer. This returns the last n lines of text stored in the buffer, concatenated into a single string with end lines terminated by a newline character. If a number of lines is not specified then it defaults to the last 10 lines of text.""" # optimization: don't use contents(), len() and all these "helpers" return "\n".join(self.__contents[-n:]) nend = self.len() nstart = nend - n if nstart < 0: nstart = 0 return self.contents(nstart,nend) def getlines(self,n,m): """Return lines 'n' through to 'm' as a list. Return a set of lines starting from line index 'n' up to but not including line index 'm', as a list.""" return self.__contents[n:m] def contents(self,n,m): """Return lines 'n' through to 'm' as a string. Return the specified lines from the buffer concatenated into a single string with line ends terminated by a newline character.""" str = "" subset = self.getlines(n,m) for line in subset: str = str + line + "\n" return str.rstrip("\n") def all(self): """Return entire buffer contents as a string. All lines in the buffer will be concatenated into a single string with line ends terminated by a newline character.""" #optimization return "\n".join(self.__contents) str = "" for line in self.__contents: str = str + line + "\n" return str.rstrip("\n") def clear(self): """Clear the buffer of all content. Delete all lines currently stored in the buffer.""" self.__contents[0:self.len()] = [] return # # tablebuffer # # Buffer class specialised for handling tables # Subclasses buffer class tablebuffer(buffer): """Buffer object specialised for dealing with CCP4 tables. This class extends the 'buffer' class with additional data and methods specialised for CCP4 tables.""" def __init__(self,maxsize=0): """Initialise a new tablebuffer object.""" self.__hasTABLE = False self.__hasGRAPHS = False self.__ndoubledollar = 0 buffer.__init__(self,maxsize) def append(self,line): """Append a line of text to the tablebuffer. This overrides the append method of the parent class and performs additional checks on the line being added, to also identify specific features (such as '$$' symbols) that are part of a CCP4 table.""" # Check if line has "$TABLE" hasTABLE = re.compile(r"\$TABLE *:").search(line) if hasTABLE: # Check if the buffer already contains a # partial or complete table if self.__hasTABLE: # Dump the existing table self.clear() self.__hasTABLE = True # Check if line has "$(GRAPHS|SCATTER)" if self.__hasTABLE: buffer.append(self,line) hasGRAPHS = re.compile(r"\$(GRAPHS|SCATTER)").search(line) if hasGRAPHS: self.__hasGRAPHS = True # Check if line has "$$" if self.__hasGRAPHS: ndoubledollar = line.count("$$") if ndoubledollar > 0: self.__ndoubledollar = self.__ndoubledollar + ndoubledollar # Check if we have a complete table yet def complete(self): """Check if the buffer appears to contain a complete table. Returns 'True' if the buffer contains the following features, encountered in this order: 1. '$TABLE' token 2. '$GRAPH' or '$SCATTER' token 3. Four '$$' tokens In this case it is likely that the buffer contains a complete CCP4 table. If any of these elements are missing then the method returns 'False'.""" if self.__hasTABLE and self.__hasGRAPHS and self.__ndoubledollar == 4: return True else: return False def clear(self): """Clear the tablebuffer of all data. This overrides the 'clear' method of the parent class and also resets the flag data that is specific to the tablebuffer class.""" self.__hasTABLE = False self.__hasGRAPHS = False self.__ndoubledollar = 0 buffer.clear(self) # logfile # # Abstract description of a CCP4 logfile # class logfile: """Object describing a program logfile. A logfile object is populated and returned by the parselog() function. This takes a file name as a single compulsory argument; the optional 'progress' argument specifies a number of lines at which to report progress when parsing the file. A logfile object holds lists of 'programs', 'tables', 'keytext messages' and 'CCP4i information messages', plus a master list of 'fragments' (which can be any of the above). There are methods to allow access to each of these lists. There is also a list of CCP4 'summaries' that have been been found in the logfile. These are kept distinct from the logfile fragments above.""" def __init__(self,filename): """Initialise the logfile object.""" # Source file if os.path.isabs(filename): self.__filename = os.path.normpath(filename) else: # Construct an absolute path self.__filename = os.path.abspath( os.path.join(os.getcwd(),filename)) # CCP4i header and tail self.__isccp4i = False self.__ccp4i_header = [] self.__ccp4i_tail = [] # List of fragments, programs, tables, keytexts # and ccp4i_info self.__fragments = [] self.__programs = [] self.__tables = [] self.__keytexts = [] self.__ccp4i_info = [] self.__summaries = [] def __nonzero__(self): """Implement the nonzero built-in method. The logfile will test as True if at least one fragment is defined - otherwise it will test as False.""" if len(self.__fragments) > 0: return True return False def append_ccp4i_header(self,line): """Append a line of text to the CCP4i header.""" # FIXME should be internally accessible only? self.__ccp4i_header.append(line) self.__isccp4i = True def ccp4i_header(self): """Return the CCP4i header content.""" return self.__ccp4i_header def append_ccp4i_tail(self,line): """Append a line of text to the CCP4i tail.""" # FIXME should be internally accessible only? self.__ccp4i_tail.append(line) self.__isccp4i = True def ccp4i_tail(self): """Return the CCP4i tail content.""" return self.__ccp4i_tail def isccp4i(self): """Return True if the logfile appears to be from CCP4i.""" return self.__isccp4i def filename(self): """Return the filename of the source logfile.""" return self.__filename def newfragment(self): """Generate a new fragement and add to the logfile. Returns a new fragment object and calls addfragment to add it to the list of fragments for this logfile.""" newfragment = fragment() self.addfragment(newfragment) return newfragment def addfragment(self,fragment): """Add an existing fragment-like object to the logfile.""" self.__fragments.append(fragment) def nfragments(self): """Return the number of fragments.""" return len(self.__fragments) def fragment(self,i): """Return the i'th fragment in the logfile. A fragment can be a program, table, CCP4i message or keytext object. Note that i counts up from zero.""" try: return self.__fragments[i] except: raise def addprogram(self): """Add a new program object to the logfile.""" # FIXME should be internally accessible only? newprogram = program() self.__programs.append(newprogram) self.addfragment(newprogram) return newprogram def nprograms(self): """Return the number of program objects.""" return len(self.__programs) def program(self,i): """Return the i'th program object in the logfile. Note that i counts up starting from zero.""" try: return self.__programs[i] except: raise def addtable(self,thistable=False,tabletext=""): """Add a table object to the list of tables. If an existing table object is specified with the thistable argument then this is appended to the list. Otherwise a new table object is created. In that case, if tabletext is supplied then this is used to populate the table object; otherwise the new table object is empty.""" # FIXME should be internally accessible only? if thistable: # Table object supplied self.__tables.append(thistable) return thistable else: # Make a new table if tabletext: new_table = table(tabletext) else: new_table = table() self.__tables.append(new_table) return new_table def ntables(self): """Return the number of tables in the logfile.""" return len(self.__tables) def table(self,i): """Return the i'th table object in the logfile. This method is deprecated, use 'logfile.tables()[i]' instead. Note that i counts up starting from zero.""" try: return self.__tables[i] except: raise def findtable(self,title_pattern,index=0): """Fetch a table in the logfile by matching the title. This method is deprecated; use the 'tables' method instead. This method looks up a particular table in a list of table objects (argument 'table_list'), by finding the first table in the list which matches the supplied regular expression 'title_pattern'. If there is more than one matching table then the 'index' argument specifies which of the list of matching tables should be returned. If index is out of range (or there are no matching tables) then return 'None'. It calls the 'find_table_by_title' function.""" return find_table_by_title(self.__tables,title_pattern,index) def tables(self,select_title=""): """Return a list of tables in the logfile. If no 'select_title' is specifed then this returns the list of all the table objects stored in the logfile object. If 'select_title' is given then this is compiled as a regular expression pattern, and the method returns a list containing only those table objects for which the title matches the pattern. In either case if no table objects are found then an empty list is returned. This method calls the 'find_tables_by_title' function.""" if select_title == "": # Return everything return copy.copy(self.__tables) return find_tables_by_title(self.__tables,select_title) def addkeytext(self,thiskeytext=False,name="",junk_text="",message=""): """Add a keytext object to the list of keytexts. If an existing keytext object is supplied with the thiskeytext argument this is appended to the list. Otherwise a new keytext object is created and populated with the contents of the name, junk_text and message arguments.""" # FIXME should be internally accessible only? if thiskeytext: # Table object supplied self.__keytexts.append(thiskeytext) return thiskeytext else: # Make a new keytext new_keytext = keytext(name,junk_text,message) self.__keytexts.append(new_keytext) return new_keytext def nkeytexts(self): """Return the number of keytexts in the logfile.""" return len(self.__keytexts) def keytext(self,i): """Return the i'th keytext object in the logfile. Note that i counts up starting from zero.""" try: return self.__keytexts[i] except: raise def addccp4i_info(self): """Add another ccp4i_info object to the logfile. Creates a new ccp4i_info object and added to the list of fragments, and to the list of CCP4i information messages found in the logfile.""" # FIXME should be internally accessible only? # Make a new ccp4i_info object newccp4i_info = ccp4i_info() self.__ccp4i_info.append(newccp4i_info) self.addfragment(newccp4i_info) return newccp4i_info def nccp4i_info(self): """Return the number of ccp4i_info messages.""" return len(self.__ccp4i_info) def ccp4i_info(self,i): """Return the i'th ccp4i_info object in the logfile. Note that i counts up starting from zero.""" try: return self.__ccp4i_info[i] except: raise def addsummary(self,start_line=-1): """Add another summary object to the logfile. A new summary object is created and returned. The new object is also added to the list of summaries for the logfile.""" new_summary = summary(self.__filename,start_line) self.__summaries.append(new_summary) return new_summary def nsummaries(self): """Return the number of summaries found in the log.""" return len(self.__summaries) def summary(self,i): """Return the i'th summary object in the logfile. Note that i counts up starting from zero.""" try: return self.__summaries[i] except: raise def set_fragment_start(self,line_no): """Set the start line of the most recent fragment. The most recent fragment is the last fragment object in the list of fragments for this logfile. 'line_no' is the current line number in the source file. If the fragment has an 'nlines' attribute then this is taken to be the offset from the current line back to the start of the fragment. If nlines is not present then the fragment is taken to start after the end of the previous fragment. If there is no previous fragment then it is assumed to start from the first line of the file""" fragment = self.__fragments[-1] fragment.set_attribute("source_file",self.__filename) if fragment.has_attribute("nlines"): # Calculate the start of the fragment from the # current position offset = fragment.nlines fragment.set_startline(line_no-offset) else: if self.nfragments() > 1: # Assume that the fragment starts from here fragment.set_startline(line_no) else: # This is the first fragment fragment.set_startline(1) # Now deal with the previous fragment, # which may not have an end line set if self.nfragments() > 1: last_fragment = self.__fragments[-2] if last_fragment.get_endline() < 0: last_fragment.set_endline(fragment.get_startline()-1) def set_fragment_end(self,line_no): """Set the end line of the most recent fragment. The most recent fragment is the last fragment object in the list of fragments for this logfile. 'line_no' is the current line number in the source file. The supplied line number is always taken as the last line number of the fragment. This method will also check the start line number and will attempt to set it to a reasonable value if it is not set: either the first line after the end of the previous fragment, or the start of the file (if there is no previous fragment).""" if len(self.__fragments) == 0: # We're in a situation where there was no # first fragment # Let's make one now self.newfragment() fragment = self.__fragments[-1] if fragment.get_endline() > -1: # Don't reset the value if it's already set return fragment.set_attribute("source_file",self.__filename) fragment.set_endline(line_no) # Check if the start is also set if fragment.get_startline() < 1: if self.nfragments() > 1: # Assume that the fragment started from the # end of the previous fragment last_fragment = self.__fragments[-2] fragment.set_startline(last_fragment.get_endline()+1) else: # This is the first fragment fragment.set_startline(1) def fragment_to_program(self,i): """Convert the i'th fragment to a program. This method allows a fragment in the logfile to be recast as a program, and performs all the necessary book keeping operations such as updating the lists of fragment and program objects. On successful conversion the converted program object is returned. If the fragment is already a program then no action is taken.""" if self.fragment(i).isprogram(): return self.fragment(i) prog = copyfragment(self.fragment(i),program()) # Add the converted program fragment to the # list of programs # To do this we need to work out where it belongs if i == 0: # Fragment was the first in the list # Add to the start of the program list self.__programs.insert(0,prog) else: # Look for a fragment after this one # in the list which is also a program nextprog = None for j in range(i,self.nfragments()): if self.fragment(j).isprogram(): nextprog = self.fragment(j) break if not nextprog: # No programs found - append self.__programs.append(prog) else: # Locate this in the list of programs j = self.__programs.index(nextprog) self.__programs.insert(j,prog) # Remove the converted fragment self.__fragments.remove(self.fragment(i)) return prog # fragment # # Abstract description of a generic logfile fragment # class fragment: """Object describing a generic fragment of a logfile. The fragment object is intended to represent any 'top-level' fragment of a logfile, for example a program logfile or some output from a script that appears inbetween program logs. The fragment class is used as the base class for the program and ccp4i_info classes.""" def __init__(self): """Initialise a new fragment object.""" # Initialise a dictionary to store arbitrary # attributes taken from the program logfile self.__dict = {} # List of tables self.__tables = [] # List of keytexts self.__keytexts = [] # For fragment retrieval self.set_source_file("") self.set_startline(-1) self.set_endline(-1) # Flags self.__nonzero = False def __nonzero__(self): """Implement the __nonzero__ built-in method. The fragment is considered 'nonzero' once an attribute, table or keytext has been assigned to it.""" return self.__nonzero def __len__(self): """Implement the __len__ built-in method. The length of a fragment is the number of lines of text that it contains.""" nlines = self.get_endline() - self.get_startline() + 1 if nlines < 0: nlines = 0 return nlines def isfragment(self): """Return True if this represents a basic fragment.""" return True def isprogram(self): """Return True if this represents a program logfile.""" return False def isccp4i_info(self): """Return True if this is a CCP4i information fragment.""" return False def __setitem__(self,key,value): """Implements the functionality for program[key] = value Wrapper for the set_attribute method.""" self.set_attribute(key,value) def __getitem__(self,key): """Implements the functionality for value = fragment[key] Wrapper for the get_attribute method.""" return self.get_attribute(key) def __getattr__(self,key): """Implements the functionality for value = fragment.key Wrapper for the get_attribute method.""" return self.get_attribute(key) def get_attribute(self,key): """Return the value of a fragment attribute. The key is a string specifying a particular fragment attribute. If the attribute has been read from the file then its value is returned, otherwise a KeyError exception is raised.""" try: return self.__dict[key] except KeyError: raise AttributeError, "Unknown attribute '"+str(key)+"'" def has_attribute(self,key): """Check whether a fragment attribute has been set. The key is a string specifying a particular fragment attribute. If the attribute has been set then this method returns True, otherwise it returns False.""" return self.__dict.has_key(key) def attributes(self): """Return a list of all the fragment attributes. The list contains all the attributes that have been set for the fragment.""" return self.__dict.keys() def set_attribute(self,key,value): """Set the value of a fragment attribute. The key is a string specifying a particular fragment attribute which will be assigned the given value. If the attribute doesn't exist then it will be created, if it does then the current value will be overwritten by the new one.""" self.__dict[key] = value self.__nonzero = True def set_attributes_from_dictionary(self,dict): """Set the values of multiple fragment attributes. For each key in dictionary 'dict', the value of a fragment attribute with the same name as the key will be assigned the same value as that of the key.""" for key in dict: self.__dict[key] = dict[key] self.__nonzero = True def addtable(self,tabletext=""): """Add a new table object to the fragment. Create a new table object and add it to the list of tables associated with the fragment. If 'tabletext' is nonblank then the table object will be automatically populated from the text, if possible. This method returns the new table object.""" if tabletext: newtable = table(tabletext) else: newtable = table() self.__tables.append(newtable) self.__nonzero = True return newtable def ntables(self): """Return the number of tables found in the fragment.""" return len(self.__tables) def table(self,i): """Return the i'th table object. This method is deprecated, use 'fragment.tables()[i]' instead. fragment.table(i) returns the i'th table object associated with the fragment object. The methods of the table class can then be used to drill down into the contents of the table. Use the ntables method to get the total number of table objects associated with the fragment.""" try: return self.__tables[i] except: raise def findtable(self,title_pattern,index=0): """Fetch a table in the fragment by matching the title. This method is deprecated; use the 'tables' method instead. This method looks up a particular table in a list of table objects (argument 'table_list'), by finding the first table in the list which matches the supplied regular expression 'title_pattern'. If there is more than one matching table then the 'index' argument specifies which of the list of matching tables should be returned. If index is out of range (or there are no matching tables) then return 'None'. It calls the 'find_table_by_title' function.""" return find_table_by_title(self.__tables,title_pattern,index) def tables(self,select_title=""): """Return a list of tables in the fragment. If no 'select_title' is specifed then this returns the list of all the table objects stored in the fragment object. If 'select_title' is given then this is compiled as a regular expression pattern, and the method returns a list containing only those table objects for which the title matches the pattern. In either case if no table objects are found then an empty list is returned. This method calls the 'find_tables_by_title' function.""" if select_title == "": # Return everything return copy.copy(self.__tables) return find_tables_by_title(self.__tables,select_title) def addkeytext(self,name="",junk_text="",message=""): """Add a new keytext object to the fragment. Create a new keytext object and add it to the list of keytexts associated with the fragment. The values of the parameters 'name', 'junk_text' and 'message' will be used to initialise the new keytext object (one or more of these can be blank). This method returns the new keytext object.""" # FIXME should be internally accessible only? newkeytext = keytext(name,junk_text,message) self.__keytexts.append(newkeytext) self.__nonzero = True return newkeytext def nkeytexts(self): """Return the number of keytexts found in the logfile fragment. 'Keytexts' are warnings and messages issued by the ccperror/CCPERR functions within programs; see the loggraph format documentation for more information, e.g. http://www.ccp4.ac.uk/dist/html/loggraphformat.html""" return len(self.__keytexts) def keytext(self,i): """Return the i'th keytext object. For example: program.keytext(i) returns the i'th keytext object associated with the program object. The methods of the keytext class can then be used to drill down into the contents of the message. Use the nkeytexts method to get the total number of keytext objects associated with the program/fragment.""" try: return self.__keytexts[i] except: raise def set_startline(self,line_no): """Set the start line of the fragment in the source document.""" self.set_attribute("startline",line_no) def get_startline(self): """Get the start line of the fragment in the source document.""" return self.get_attribute("startline") def set_endline(self,line_no): """Set the end line of the fragment in the source document.""" self.set_attribute("endline",line_no) def get_endline(self): """Get the end line of the fragment in the source document.""" return self.get_attribute("endline") def set_source_file(self,source_file): """Set the source document for the fragment. The source document is specified as the name of the file that the fragment is part of.""" self.set_attribute("source_file",source_file) def get_source_file(self): """Get the source document for the fragment.""" return self.get_attribute("source_file") def retrieve(self): """Retrieve the text associated with the fragment. This uses the 'retrieve' method within the module.""" # Retrieve the information filen = self.get_source_file() start = self.get_startline() end = self.get_endline() return retrieve(filen,start,end) # # program # # Abstract description of the logfile for a single program # class program(fragment): """Object describing the log for a single program. program objects are instantiated and populated by parselog as part of the parsing process. The program object is intended to describe a fragment of logfile that corresponds to the run of a particular program, although in practice other types of logfile features (for example, 'interstitial' fragments i.e. bits of output inbetween program logs) are also assigned to program objects in the current version of smartie. A program object holds various attributes describing the logfile fragment in question, as well as a list of tables and keytext messages. A program object may also hold CCP4i information messages, however normally it will not hold this at the same time as actual program data. The attributes associated with the program object can be accessed using either of the syntaxes 'program['attribute']' or 'program.attribute'. For programs using the standard CCP4 banners, the following attributes may be defined: name: the name of the program from the CCP4 banner, or equivalent. version: the program version; for CCP4 programs, this is the version found in the program banner. For programs that don't explicitly give their own version number this will be the same as the CCP4 library version. date: the date string found in the CCP4 banner; it is typically the last date that the source code file was committed to CVS. It is not the date that the program was run on - for that, see the 'rundate' and 'runtime' attributes. ccp4version: the CCP4 library version as it appears in the program banner. Typically this includes only the major and minor version numbers, but not the patch level. user: the user id that appears in the CCP4 banner at runtime. runtime: the time of day that the program run started at as reported in the program banner. rundate: the date that the program run started at as reported in the program banner. termination_name: the program name as reported in the CCP4 termination message at the tail of the program log. termination_message: the message text displayed in the CCP4 termination message. usertime: the value of the 'user time' given at termination. systemtime: the value of the 'system time' given at termination. elapsedtime: the value of the 'elapsed time' given at termination. Note that not all these attributes may be defined, for example if the program fragment is an incomplete CCP4 log file or if the program is not a CCP4 program. Use the 'attributes' method to get a list of the defined attributes. In addition the program object also stores a list of the keyword input lines; this list can be retrieved directly using the 'keywords' method.""" def __init__(self): """Initialise a new program object.""" # Initialise the base class fragment.__init__(self) # Initialise program-specific flags and # attributes self.__isccp4 = False self.__termination = False # List of keyword lines self.__keywords = [] # Dictionary of logical name/filename pairs self.__logicalnames = {} def isprogram(self): """Return True if this represents a program logfile. Overrides the 'isprogram' method in the base class.""" return True def isfragment(self): """Return True if this represents a raw logfile fragment. Overrides the 'isfragment' method in the base class.""" return False def set_isccp4(self,isccp4): """Set whether the logfile fragment is from a CCP4 program or not. This method sets the value of the isccp4 flag to True if the logfile fragment is determined to be from a CCP4 program, and False if not. Use the 'isccp4' method to return the value of this flag.""" # Possibly this should be internally accessible only? self.__isccp4 = isccp4 def isccp4(self): """Check if the logfile fragment is from a CCP4 program. This returns True if the fragment of logfile appeared to be from a CCP4 program, and False otherwise.""" return self.__isccp4 def set_termination(self,termination): """Set whether the logfile has a termination message. This sets the value of the 'termination' flag to be True if a termination message was found, and False if not. Use the 'termination' method to return the value of this flag.""" # FIXME should be internally accessible only? self.__termination = termination def termination(self): """Check if the logfile fragment ends with a valid termination. This returns True if the fragment appeared to finish with a recognised termination message, False otherwise. Program fragments that do not end with a termination message may have terminated prematurely due to an error.""" return self.__termination def addkeyword(self,line): """Append a keyword input line to the program logfile. This appends a keyword input line (with any leading text removed) to the list of keyword lines stored in the program object.""" self.__keywords.append(line) def keywords(self): """Return the list of keyword lines. This method returns a list of the keyword input lines that have been stored for the program object. The lines are otherwise unprocessed. The lines are stored in the order that they were originally stored, and so should reflect the order that they appear in the logfile.""" return self.__keywords def addlogicalname(self,logical_name,filename): """Add a logical name/filename reference. This adds a logical name and the associated filename to the dictionary of files that were reported as being opened in the logfile. If the same logical name is added multiple times then only the last associated filename is kept.""" self.__logicalnames[logical_name] = filename def logicalnames(self): """Return a list of logical names associated with the program. The name of the file associated with a logical name can be retrieved using the 'logicalnamefile' method.""" return self.__logicalnames.keys() def logicalnamefile(self,logical_name): """Return the filename associated with a logical name. Given a logical name, return the associated filename. If the logical name isn't found then a KeyError exception is raised.""" try: return self.__logicalnames[logical_name] except KeyError: raise KeyError, "Logical name '"+str(logical_name)+"' not found" # # table # # Abstract description of a CCP4 formatted logfile table # class table: """Object describing a CCP4 logfile table The table class represents the various components of a table as output in CCP4 program logfiles. These tables are formatted in a standard way that enables the data that they contain to be displayed by the (x|j)loggraph programs. For a description of the loggraph format see the loggraph format documentation, e.g. http://www.ccp4.ac.uk/dist/html/loggraphformat.html A table consists of a number of columns of data, and a number of graphs which are defined as being a subset of these columns. Within smartie the table_column class represents an individual column, and the table_graph class represents an individual graph. A table object can be populated when it is created, by supplying it with text containing a CCP4-formatted table (typically, a fragment of logfile text). Alternatively an 'empty' table can be instantiated and then populated using the methods of the objects. The contents of the table can be output in the correct CCP4 format using the 'show' and 'jloggraph' methods.""" # Initialise the table object def __init__(self,tabletext=""): """Create a new table object. If tabletext contains the text of an existing CCP4-formatted table then the table object will attempt to parse the table and populate itself using the supplied data. If 'tabletext' cannot be interpreted as a table then the table object will be 'empty' and will contain no data. In this case, if 'tabletext' consists of a single line with no trailing newline then the table object title will be set to 'tabletext' automatically.""" # Table attributes self.__title = "" self.__type = "GRAPHS" # Default to GRAPHS self.__graphs = "" self.__columns = "" self.__text = "" self.__data = "" # Derived data self.__graph_list = [] self.__column_list = [] # Indicate the the object has been populated self.__table_parse_error = False self.__nonzero = False # The "raw" table data from the log file self.__rawtable = "" # Attempt to populate the table if tabletext: self.__rawtable = tabletext if not self.__buildtable(tabletext): # Failed to extract table # If it could be a title then use this # instead if str(tabletext).count("\n") == 0: self.settitle(tabletext) return def __nonzero__(self): """Builtin: provides the True/False test. A table object is nonzero if data has been loaded into it either at instantiation or subsequently using any of its methods.""" return self.__nonzero def __str__(self): """Builtin: return the table title""" if self.__nonzero: return self.__title return "" def __buildtable(self,tabletext): """Internal: populates the table object from an existing formatted table. 'tabletext' should be a block of text containing a CCP4 formatted table. This text can also contain extra leading or trailing text which is not part of the table, and this will be ignored. __buildtable extracts the various components of data from the supplied table text and populates the table object appropriately.""" # Set up the table object by parsing the # the supplied text tabledata = patternmatch().isccp4table(tabletext) if not tabledata: # No match # The regular expression failed to process the table self.__table_parse_error = True return False # Populate the table object self.settitle(tabledata["title"]) self.settype(tabledata["type"]) self.setgraphs(tabledata["graphs"]) self.setcolumns(tabledata["columns"]) self.settext(tabledata["text"]) self.setdata(tabledata["data"]) self.__nonzero = True return True def __populate_columns(self): """Internal: populates the table_column objects. This method processes the raw data in the body of the loggraph text and extracts white-space delimited data items, which are then assigned to individual columns. Where possible data are stored using an appropriate type, either integer, float or string.""" # Parse the raw data and populate the table_column # objects for this table i = 0 for item in self.__data.split(): self.table_column(i).append(item) i += 1 if i == self.ncolumns(): i = 0 # If there are enough items then i should be # zero at the end if i != 0: # This error could be due to two data items # no longer being separated by whitespace print "Unable to parse table - too many data items (or not enough)?" print "Table title: \""+str(self.title())+"\"" print "Number of columns : "+str(self.ncolumns()) print "Number of data items: "+str(len(self.__data.split())) self.__table_parse_error = True def parse_error(self): """Check if the supplied table was parsed correctly. If there was a problem parsing the raw table text (for example if the table was incomplete or misformatted) then parse_error() will return True, otherwise it will be False.""" # Check the table_parse_error flag return self.__table_parse_error def setrawtable(self,rawtable): """Store the 'raw' table text from the original logfile. The raw table data is the original text (for example, the fragment of log file text) supplied to the object to populate itself from.""" # Store the "raw" table data self.__rawtable = rawtable def rawtable(self): """Return the 'raw' table text taken from the logfile. This returns any original 'raw' text of the table that was used to populate the table object. If the table object wasn't populated from a text fragment then this will return an empty string. The 'loggraph' and 'jloggraph' methods are recommended over the 'rawtable' method as a way to return the table data formatted with loggraph tags.""" return self.__rawtable def settitle(self,title): """Store the table title. The table title is an arbitrary string of text that is intended to describe briefly the nature of the data presented in the table.""" self.__title = title self.__nonzero = True def title(self): """Return the table title stored in the object.""" return self.__title def settype(self,graphtype): """Store the table graph type. This is currently one of two possible loggraph keywords, either GRAPHS or SCATTER. The keyword is an indication to plotting software of how the data should be displayed: GRAPHS: line graphs, with data points joined by lines SCATTER: scatter plots, with data plotted as points. Raises a ValueError if the graphtype is not recognised.""" if "GRAPH" in str(graphtype): self.__type = "GRAPHS" elif "SCATTER" in str(graphtype): self.__type = "SCATTER" else: # Unknown type of graph - raise an exception raise ValueError, "Unknown graph type: "+graphtype+"\n"+\ "Must be one of 'GRAPHS' or 'SCATTER'" self.__nonzero = True def type(self): """Return the table graph type. See the 'settype' method for the possible values and their associated meanings for the table graph type.""" return self.__type def nrows(self): """Return the number of complete rows in the table. Returns the length of the shortest column of data stored in the table.""" if self.ncolumns() == 0: return 0 nrows = self.table_column(0).nrows() for i in range(1,self.ncolumns()): nrows = min(self.table_column(i).nrows(),nrows) return nrows def setgraphs(self,graphs): """Store the graph definitions in the table in 'raw' format. Within a CCP4-formatted table, one or more graphs are be defined using simple strings. Generally the descriptions take the form: :graph1 name:graphtype:column_list: :graph2 name:graphtype:column_list: ... (The graph definitions can be separated using whitespace, not necessarily newlines). The 'setgraphs' method takes an arbitrary number of graph definition strings of the above form and extracts from each the data, namely: the graph name (i.e. title), the type (normally either GRAPH or SCATTER) and a list of column numbers in the table.""" self.__graphs = graphs # Create table_graph objects rgraph = re.compile(r":([^:]+):([^:]+):([^:]+):") for graph in rgraph.findall(graphs): new_graph = self.addgraph(graph[0]) new_graph.setscaling(graph[1]) new_graph.setcolumns(graph[2]) self.__nonzero = True def graphs(self): """Return the graph titles and descriptions. This method returns the 'raw' string containing the graph definitions for the table, which were originally supplied via the 'setgraphs' method. Of itself this data is probably not very useful.""" return self.__graphs def setcolumns(self,columns): """Create new columns in the table from the 'raw' data. Within a CCP4-formatted table, titles of columns are supplied as a string of white-space delimited 'tokens' (the tokens are the titles). For example: Resln_Range 1/resol^2 Nref Nz1 Nz2 Nz3 ... This string is supplied to the setcolumns method as the 'columns' argument. setcolumns then extracts the individual column titles and for each makes a new (empty) table_column object. If table values have previously been stored in the table object (via the 'setdata' method) then setcolumns will also attempt to populate the columns from this data.""" # Store the column titles ("raw" format) # This is a list of white-space separated strings self.__columns = columns # Create table_column objects for col in columns.split(): self.addcolumn(col) # Attempt to populate the column objects if self.__data: self.__populate_columns() self.__nonzero = True def columns(self): """Return the original column titles text string. This method returns the 'raw' string that was supplied via the 'setcolumns' method, i.e. a single string with whitespace delimited column titles. Of itself this data is probably not very useful.""" return self.__columns def settext(self,text): """Store the arbitrary text from the table. Within a CCP4-formatted table there is space for a 'arbitrary text' (see the table format documentation for more details on this). This text is not used when plotting graphs but is included when the data in the table is written back out in CCP4 $TABLE format.""" self.__text = text self.__nonzero = True def text(self): """Return the arbitrary text from the table. This returns any arbitrary text associated with the table header that was previously stored using the 'settext' method.""" return self.__text def setdata(self,data): """Store the raw tabulated data for the table. The body of a CCP4-formatted table contains tabulated data items corresponding to columns associated with the column titles. The table body consists of a sequence of white-space separated data items (typically numbers but could be other data). The table body data is supplied to this method as a single text string via the 'data' argument, and is stored as is in the table object. If table columns have also previously been defined then this method will further attempt to populate the columns with the data from the table body.""" # Store the data from the table ("raw" format) # This is a list of whitespace separated data items self.__data = data # Attempt to populate the column objects if self.ncolumns() > 0: self.__populate_columns() self.__nonzero = True def data(self): """Return the 'raw' data from the table body. This method returns the 'raw' table body text as supplied originally via the 'setdata' method. Of itself this data is probably not very useful.""" return self.__data def ncolumns(self): """Return the number of columns in the table. This method returns the number of table_column objects associated with the table.""" return len(self.__column_list) def addcolumn(self,title=""): """Add a new column to the table. This method adds a new 'table_column' object to the table, to represent a column of data. Optionally the name of the column can be supplied via the 'title' argument. The table_column is otherwise unpopulated. The new table_column is returned by this method.""" new_column = table_column() self.__column_list.append(new_column) if title: new_column.settitle(title) return new_column def list_columns(self): """Return a list of the column names defined in the graph.""" columns = [] for icol in range(0,self.ncolumns()): columns.append(self.table_column(icol).title()) return columns def add_data(self,rowdata): """Add a row of values to the table. 'rowdata' is a dictionary which specifies column names as keys defining the values to be appended. For example, if the table has columns called 'X', 'Y' and 'Z', then rowdata might be defined as: rowdata = { 'X': 0.0, 'Y': 0.776, 'Z': 878 } 'Null' values (i.e. '*' character) will be added to columns not named to keep the table data well-formed. For example: rowdata = { 'X': 0.0, 'Z': 878 } will assign the expected data to the 'X' and 'Z'columns, while assigning the '*' character to the 'Y' column. """ if len(rowdata.keys()) == 0: # No columns were specified return for colnam in rowdata.keys(): # Check the the column is actually defined in # in the table try: self.list_columns().index(colnam) except ValueError: # The column name wasn't found raise ValueError, "Column "+str(colnam)+\ " is not defined in the table" for icol in range(0,self.ncolumns()): # Look up whether the column has an # explicit value assigned colnam = self.table_column(icol).title() if rowdata.has_key(colnam): self.table_column(icol).append(rowdata[colnam]) else: # Assign a null value self.table_column(icol).append("*") def definegraph(self,title,columns,scaling=""): """Add a new graph definition to the table. This provides an interface to adding new graph definitions to an existing table. title: title for the graph. columns: a list of column names. The first column will the be the X-axis, others will be the Y-values. scaling: (optional) the scaling definition. Possible scaling strings are: 'A': fully automatic (axes limits are automatically determined when the graph is rendered (this is the default) 'N': 'nought', axes limits start at zero 'XMIN|XMAXxYMIN|YMAX': limits xmin,xmax and ymin,ymax. Raises a ValueError if insufficient number of columns are specified, or if a specified column name doesn't appear in the table. """ # Check that there are at least two columns if len(columns) < 2: raise ValueError, "Graph definition needs at least two columns" # Build the graph description i.e. list of comma-separated # column numbers (this is the loggraph format) graph_desc = "" for colnam in columns: found = False for icol in range(0,self.ncolumns()): if self.table_column(icol).title() == colnam: graph_desc = graph_desc+","+str(icol+1) found = True break if not found: # The specified column wasn't located # Raise an exception raise ValueError, "Column "+str(colnam)+" not found in table" # Built the list - strip any leading commas graph_desc = graph_desc.strip(",") # Add a 'blank' table_graph new_graph = self.addgraph(title) # Scaling type new_graph.setscaling('A') if scaling != "": new_graph.setscaling(scaling) new_graph.setcolumns(graph_desc) return new_graph def table_column(self,i): """Return the i'th column associated with the table. This returns the i'th table_column object in the table. Note that i counts from zero. Generally applications that want to examine the data stored in a table are better off using the 'col' method of the table class rather than the 'table_column' method. The 'col' method allows data to be retrieved by column name, and returns the column of data as a Python list.""" try: return self.__column_list[i] except: raise def col(self,name): """Return the data in the column identified by 'name'. This method returns the data in the table column identified by 'name' (for example, 'Rfree'), as a list of values. (This is a copy of the list of values in the table_column object representing the column.) If the named column isn't found then a LookupError exception is raised.""" # Identify the column corresponding to the # supplied name and return a copy of the data for i in range(0,self.ncolumns()): if self.table_column(i).title() == name: return copy.copy(self.table_column(i).data()) # Column not found raise LookupError, "Column called '"+str(name)+"' not found" def ngraphs(self): """Return the number of graphs defined in the table. This method returns the number of 'table_graph' objects associated with the table.""" return len(self.__graph_list) def addgraph(self,title=""): """Add a new graph object to the table. This method adds a new 'table_graph' object to the table. Optionally the name of the new graph can be supplied using the 'title' argument. All other graph attributes are unset by default.""" new_graph = table_graph() self.__graph_list.append(new_graph) new_graph.set_parent_table(self) if title: new_graph.settitle(title) return new_graph def table_graph(self,i): """Return the i'th graph object. This method returns the i'th table_graph object associated with the table. (Note that i starts from zero.)""" try: return self.__graph_list[i] except: raise def jloggraph(self,codebase="",width=400,height=300): """Return a jloggraph-formatted table. This method returns the text for CCP4-formatted table from this object which includes the HTML tags required for the jloggraph Java applet. The codebase argument should specify the full path for the JLogGraph.class and JLogCanvas.class files required to run the applet (typically this is $CCP4/bin/).""" # Wraps the show method jloggraph = "For inline graphs use a Java browser" return jloggraph def loggraph(self,pad_columns=True): """Return a loggraph-formatted table. The loggraph method generates the text of the table based on the data stored in the object, with the correct tags defining the columns and graphs and which should be viewable in (x)loggraph. For information on the 'pad_columns' option, see the 'show' method (the setting here is passed directly to 'show'). To generate jloggraph-formatted tables use the jloggraph method.""" return self.show(loggraph=True,html=True,pad_columns=pad_columns) def show(self,loggraph=False,html=False,pad_columns=True): """Return the text of a CCP4-formatted table. The show method generates the text of the table based on the data stored in the object. If the 'loggraph' argument is specified as 'True' then the table includes the correct tags defining the columns and graphs and which should be viewable in (x)loggraph. If the 'html' argument is specified then special HTML characters in the titles are escaped. If 'pad_columns' is True then columns will be padded with spaces in order to make them line up nicely. If padding is not required then set it to False.""" tabletext = "" # Preamble for loggraph if loggraph: table_title = self.title() if html: table_title = escape_xml_characters(table_title) tabletext = tabletext+"$TABLE: "+table_title+":\n$"+self.type()+"\n" # Graph descriptions for i in range(0,self.ngraphs()): graph = self.table_graph(i) graph_title = graph.title() if html: graph_title = escape_xml_characters(graph_title) tabletext = tabletext+" :"+ \ graph_title+":"+ \ graph.scaling()+":" for col in graph.columns(): tabletext = tabletext+str(col+1)+"," tabletext = tabletext.rstrip(",") tabletext = tabletext+":\n" tabletext = tabletext+"$$\n" # Columns and rows ncolumns = self.ncolumns() if ncolumns > 0: nrows = len(self.table_column(0)) else: nrows = 0 # Determine field widths for printing field_width = [] if pad_columns: for i in range(0,ncolumns): max_width = len(self.table_column(i).title()) for item in self.table_column(i).data(): if len(str(item)) > max_width: max_width = len(str(item)) if max_width >= len(self.table_column(i).title()): # Put in an extra space again max_width = max_width+1 field_width.append(max_width) else: for i in range(0,ncolumns): field_width.append(0) # Column titles for i in range(0,ncolumns): title = self.table_column(i).title() while len(title) < field_width[i]: title = " "+title tabletext = tabletext+" "+title # Arbitrary text in loggraph format if loggraph: tabletext = tabletext+" $$" if self.text(): tabletext = tabletext+self.text() tabletext = tabletext+" $$\n" else: tabletext = tabletext+"\n\n" # The columns of data for i in range(0,nrows): for j in range(0,ncolumns): item = self.table_column(j)[i] while len(str(item)) < field_width[j]: item = " "+str(item) tabletext = tabletext+" "+str(item) tabletext = tabletext+"\n" # End of table if loggraph: tabletext = tabletext+"$$" return tabletext def html(self,border=2): """Return the text of a table with HTML formatting. This method returns the body of the table (column titles and column data) marked up as a HTML table. The width of the table can be controlled by setting the 'border' argument. Any HTML special characters (<, > and &) in the column titles or data items are automatically converted to the correct form for HTML.""" tabletext = "\n" # Columns and rows ncolumns = self.ncolumns() if ncolumns > 0: nrows = len(self.table_column(0)) else: nrows = 0 # Column titles tabletext = tabletext+"\n" for i in range(0,ncolumns): title = self.table_column(i).title() tabletext = tabletext+" \n" tabletext = tabletext+"\n" # The columns of data for i in range(0,nrows): tabletext = tabletext+"\n" for j in range(0,ncolumns): item = self.table_column(j)[i] tabletext = tabletext+" \n" tabletext = tabletext+"\n" # End of table tabletext = tabletext+"
"+ \ str(escape_xml_characters(title))+ \ "
"+ \ str(escape_xml_characters(item))+ \ "
" return tabletext # # table_graph # # Abstract description of a graph in a CCP4 logfile table # class table_graph: """Object describing a graph in a CCP4 logfile table. Tables in logfiles can contain any number of 'graphs', which are represented within smartie by table_graph objects. A graph is defined by a title, a scaling type, and a collection of table_columns storing columns of data.""" # Initialise the table_graph object def __init__(self,title="",scaling="",column_list=[]): """Create a new table_graph object. The 'title' argument is a string containing the title for the graph. 'scaling' is a string describing how the graph should be displayed within the (x|j)loggraph program. 'column_list' is a list of integers corresponding to the columns in the table that holds the graph. The first column in the list will form the 'x' axis of the graph when displayed, the others will be displayed on the 'y' axis.""" self.__title = title self.__column_list = column_list self.__scaling = scaling if self.__title: self.__nonzero = True else: self.__nonzero = False # Store a reference to the parent table self.__parent_table = None def __nonzero__(self): return self.__nonzero def settitle(self,title): """Store the title of the graph.""" self.__title = title def title(self): """Return the title of the graph.""" return self.__title def set_parent_table(self,table): """Store a reference to the parent table object.""" self.__parent_table = table def graphcols(self): """Return a list of the column names in the graph.""" columns = [] table = self.__parent_table for col in self.__column_list: columns.append(table.table_column(col).title()) return columns def setscaling(self,scaling): """Store the scaling description. This is a string which should take one of three possible forms, and which is an instruction to the display program on how to scale the graph data for display. 'A' is 'fully automatic' scaling (the display program determines the scaling itself for both axes). 'N' (for 'nought') is automatic y coordinate scaling, where the lowest limit on the y axis is 0s. 'XMIN|XMAXxYMIN|YMAX' (where XMIN, XMAX and YMIN, YMAX are numbers) specifies the exact limits of both axes.""" self.__scaling = scaling def scaling(self): """Return the scaling description.""" return self.__scaling def setcolumns(self,columns): """Set the table_columns associated with the graph. The columns are specified as a string of the form e.g. '1,2,4,5'. Note that the column numbers are adjusted downwards by 1 to map onto Python numbering (which starts at zero).""" self.__column_list = [] for i in columns.split(","): if str(i).strip().isdigit(): self.__column_list.append(int(i)-1) def columns(self): """Return the list of columns associated with the graph. This is a list of integers corresponding to the columns in the table.""" return self.__column_list # # table_column # # Abstract description of a column in a CCP4 logfile table # class table_column: """Object describing a column in a CCP4i logfile table""" def __init__(self,title=""): """Initialise the table_column object.""" self.__title = title self.__data = [] if self.__title: self.__nonzero = True else: self.__nonzero = False def __nonzero__(self): """Returns True if the column contains data, False otherwise.""" return self.__nonzero def __len__(self): """Implements len(table_column).""" return len(self.__data) def __getitem__(self,key): """Implement table_column[i] to return the i'th data value.""" return self.__data[key] def settitle(self,title): """Set the title of the column.""" self.__title = title def title(self): """Return the title of the column.""" return self.__title def append(self,item): """Append a data value to the end of the column. The value will be stored as integer, float or string as appropriate.""" try: # Is it a float? value = float(item) # But, could it be an integer? # Try a horrible test if float(int(item)) == value: # It's actually an integer value = int(item) except ValueError: # Not a numerical value - store as a string value = item # Add the data item as the correct type self.__data.append(value) def data(self): """Return the list of data values in the column.""" return self.__data def nrows(self): """Return the number of rows in the column.""" return len(self.__data) # # keytext # # Abstract description of a CCP4 formatted keytext message # class keytext: """Object describing a keytext message in a CCP4 logfile""" # Initialise the keytext object def __init__(self,name="",junk_text="",message=""): self.setname(name) self.setjunk_text(junk_text) self.setmessage(message) def setname(self,name): # Set the name attribute self.__name = str(name).strip() def name(self): # Return the name attribute return self.__name def setjunk_text(self,junk_text): # Set the junk_text attribute self.__junk_text = str(junk_text).strip() def junk_text(self): # Return the junk_text attribute return self.__junk_text def setmessage(self,message): # Set the message attribute self.__message = str(message).strip() def message(self): # Return the message attribue return self.__message # # ccp4i_info # # Abstract description of a CCP4i information message # class ccp4i_info(fragment): """Object describing a CCP4i information message in a CCP4 logfile. The ccp4i_info class has the following attributes: 'message': the text of the CCP4i information message.""" # Initialise the ccp4i_info object def __init__(self): # Initialise the base class fragment.__init__(self) # Initialise program-specific flags and # attributes self.set_attribute("message","") def isccp4i_info(self): return True def isfragment(self): return False # # summary # # Abstract description of a CCP4 "summary" block # class summary: """Object describing a summary block in a CCP4 logfile. The summary object holds information about the location of a block of text in a logfile. Normally this text would be a summary block from a CCP4 logfile, which is identified as starting with the text '' and terminating with the text ''. In practice, the summary object has three attributes: the name of a source file, and the start and end line numbers of the block of text within that file. The actual text is not stored. It can be fetched using the 'retrieve' method, in which case it is read directly from the file and returned.""" # Initialise the keytext object def __init__(self,source_file,start_line=-1): self.__source_file = source_file if start_line > 0: self.__start_line = start_line else: self.__start_line = -1 self.__end_line = -1 def set_start(self,start_line): """Set the start line for the summary block.""" self.__start_line = start_line def set_end(self,end_line): """Set the end line for the summary block.""" self.__end_line = end_line def start(self): """Return the start line for the summary block.""" return self.__start_line def end(self): """Return the end line for the summary block.""" return self.__end_line def iscomplete(self): """Check whether the summary block is complete. Returns True if the start and end line numbers are valid and consistent, and False otherwise.""" if self.__start_line < 0: return False if self.__end_line < self.__start_line: return False return True def retrieve(self): """Return the text within the summary block.""" if not self.iscomplete(): return "" return retrieve(self.__source_file, self.__start_line, self.__end_line) class patternmatch: """Object holding regular expressions for logfile features. The patternmatch object provides a set of methods that can match various features that might be found in CCP4 logfiles, and logfiles from other programs. These are: isccp4banner: check for CCP4 program banner isccp4termination: check for CCP4 program termination message isshelxbanner: check for SHELX program banner isshelxtermination: check for SHELX program termination isccp4keytext: check for CCP4 keytext messages isccp4table: check for CCP4 table isccp4iheader: check for CCP4i logfile header line isccp4itail: check for CCP4i logfile tail line isccp4iinformation: check for CCP4i information block It also provides methods to match single lines: isdataline: check if line contains CCP4 keyword input isfileopen: check if line contains CCP4 file opening information issummary_begin: check if line contains the start of a summary issummary_end: check if the line contains the end of a summary In each case, the method returns False if there is no match, and a dictionary of data items if there is a match. The data items are dependent on the type of pattern that is matched - see the information for the relevant method for descriptions.""" def __init__(self): # Initialise # Create a dictionary to hold the regular expressions self.__patterns = dict() def compile(self,name,pattern): """Returns a compiled regular expression from the pattern. This method returns a compiled regular expression associated with 'name', based on the supplied 'pattern'. If the name already has a compiled expression then that is returned, otherwise the compile method compiles and stores it before returning it.""" try: return self.get_pattern(name) except KeyError: return self.store_pattern(name,re.compile(pattern)) def has_pattern(self,name): """Returns True if there is a pattern associated 'name'.""" return self.__patterns.has_key(name) def store_pattern(self,name,cpattern): """Store a compiled regular expression associated with 'name'. 'cpattern' is a compiled regular expression which will be associated with 'name'. The expression can be retrieved using the 'get_pattern'.""" # Store the compiled regular expression in "pattern" # with the key "name" if not self.has_pattern(name): self.__patterns[name] = cpattern return cpattern # Raise an exception if a pattern has already been # stored with the same name raise KeyError def get_pattern(self,name): """Fetch a compiled regular expression associated with 'name'.""" return self.__patterns[name] def isccp4banner(self,text): """Regular expression match to CCP4 program banner. Given a block of text, attempts to match it against regular expressions for a CCP4 program banner. Returns False if the match fails, otherwise returns a dictionary object populated with attributes derived from the supplied text. See the isccp4banner_standard and isccp4banner_phaser functions for descriptions of the attributes that are extracted.""" # Try standard CCP4 banner result = self.isccp4banner_standard(text) if not result: # Try Phaser-style CCP4 banner result = self.isccp4banner_phaser(text) if not result: # Try old-style CCP4 banner result = self.isccp4banner_old(text) return result # Match CCP4 program termination def isccp4termination(self,text): """Regular expression match to CCP4 program termination. Given a block of text, attempts to match it against regular expressions for a CCP4 program termination. Returns False if the match fails, otherwise returns a dictionary object populated with attributes derived from the supplied text. See the isccp4termination_standard and isccp4termination_phaser functions for descriptions of the attributes that are extracted.""" # Try standard CCP4 termination result = self.isccp4termination_standard(text) if not result: # Try Phaser-style CCP4 termination result = self.isccp4termination_phaser(text) return result # Match standard CCP4 program banner def isccp4banner_standard(self,text): """Test if text matches a standard CCP4 program banner. If the match fails then return False; if it succeeds then return a dictionary with the following keys: name: the name of the program from the CCP4 banner. version: the program version; for CCP4 programs, this is the version found in the program banner. For programs that don't explicitly give their own version number this will be the same as the CCP4 library version. date: the date string found in the CCP4 banner; it is typically the last date that the source code file was committed to CVS. (It is not the date that the program was run on - for that, use 'rundate' and 'runtime'). ccp4version: the CCP4 library version as it appears in the program banner. Typically this includes only the major and minor version numbers, but not the patch level. user: the user id that appears in the CCP4 banner at runtime. runtime: the time of day that the program run started at as reported in the program banner. rundate: the date that the program run started at as reported in the program banner.""" # # Current banner looks like: # ############################################################### # ############################################################### # ############################################################### # ### CCP4 5.99: Refmac_5.2.0019 version 5.2.0019 : 04/08/05## # ############################################################### # User: pjx Run date: 25/10/2005 Run time: 15:19:23 # # There is also an intermediate version between 4.0 and later: # 1############################################################### # ############################################################### # ############################################################### # ### CCP4 4.1: OASIS version 4.1 : 12/02/01## # ############################################################### # User: pjx Run date: 14/ 5/01 Run time:15:24:36 # if "### CCP" not in text: return dict() banner = self.compile("isccp4banner_standard",r"(?: |1)#{63,63}\n #{63,63}\n #{63,63}\n ### CCP4 ([0-9.]+[-A-Za-z]*): ([A-Za-z0-9_().]+) *version ([0-9.]+[a-z]*) *: ([0-9 /]+)##\n #{63,63}\n User: *([A-Za-z0-9: /]+) *Run date: ([0-9 /]+) Run time: ?([0-9:]+) ?").search(text) #banner = rbanner.search(text) result = dict() if banner: result["banner_text"] = banner.group(0) result["ccp4version"] = banner.group(1) result["name"] = banner.group(2) result["version"] = banner.group(3) result["date"] = banner.group(4) result["user"] = banner.group(5) result["rundate"] = banner.group(6) result["runtime"] = banner.group(7) result["nlines"] = banner.group(0).count("\n") return result # Match standard CCP4 program termination def isccp4termination_standard(self,text): """Test if text matches a standard CCP4 program termination. If the match fails then return False; if it succeeds then return a dictionary with the following keys: termination_name: the program name as reported in the CCP4 termination message at the tail of the program log. termination_message: the message text displayed in the CCP4 termination message, e.g. 'Normal termination'. usertime: the value of the 'user time' given at termination. systemtime: the value of the 'system time' given at termination. elapsedtime: the value of the 'elapsed time' given at termination.""" # # Termination looks like: # Refmac_5.2.0019: End of Refmac_5.2.0019 # Times: User: 6.0s System: 0.4s Elapsed: 0:07 # # (Note that older program logs may have additional or different # whitespace arrangements) # if "Times: User: " not in text: return dict() term = self.compile("isccp4termination_standard",r" *([A-Za-z0-9_().]+): *([^\n]+)\n *Times: User: +([0-9.]+)s System: +([0-9.]+)s Elapsed: +([0-9:]+) *").search(text) result = dict() if term: result["termination_text"] = term.group(0) result["termination_name"] = term.group(1) result["termination_message"] = term.group(2) result["usertime"] = term.group(3) result["systemtime"] = term.group(4) result["elapsedtime"] = term.group(5) result["nlines"] = term.group(0).count("\n") return result # Match "phaser-style" CCP4 banner def isccp4banner_phaser(self,text): """Test if text matches a 'phaser-style' CCP4 program banner. 'Phaser-style' banners look similar to CCP4 banners but contain some different information. They are also used by the 'pointless' program. If the match fails then return False; if it succeeds then return a dictionary with the following keys: name: the name of the program from the banner. version: the reported program version. user: the user id that appears in the banner at runtime. rundate: the date that the program run started at as reported in the program banner. runtime: the time of day that the program run started at as reported in the program banner. os: corresponds to the 'os type' as reported in the banner, for example 'linux'. date: corresponds to the 'release date' of the program as reported in the banner. ccp4version: currently set to '?'.""" # This style of banner looks like: # 1234567890123456789012345678901234567890123456789012345678901234567890123456789012345 # ##################################################################################### # ##################################################################################### # ##################################################################################### # ### CCP4 PROGRAM SUITE: Phaser 1.3.2 ### # ##################################################################################### # User: pjx # Run time: Wed May 17 09:27:42 2006 # Version: 1.3.2 # OS type: linux # Release Date: Sun Feb 5 17:29:18 2006 # # Note: # 1. The "OS type" line may not always be present # 2. Pointless also writes a similar banner, but with "CCP4 SUITE" # substituted for "CCP4 PROGRAM SUITE" # # The regular expression accommodates both these differences. if "### CCP" not in text: return dict() banner = self.compile("isccp4banner_phaser",r"#+\n#+\n#+\n### CCP4 (PROGRAM )?SUITE: ([A-Za-z0-9_.]+) *([0-9.]+) *###\n#+(?:\nUser: *)?([^\n]*)\nRun time: *([A-Za-z0-9: /]+)\nVersion: *([0-9.]+)(?:\nOS type: *)?([^\n]*)\nRelease Date: *([A-Za-z0-9: /]+)").search(text) result = dict() if banner: ##print "Identified Phaser-style banner" result["banner_text"] = banner.group(0) result["name"] = banner.group(2) result["version"] = banner.group(3) result["user"] = banner.group(4) result["rundate"] = banner.group(5) result["runtime"] = banner.group(5) result["os"] = banner.group(7) result["date"] = banner.group(8) result["ccp4version"] = "?" result["nlines"] = banner.group(0).count("\n") return result # Match "phaser-style" CCP4 program termination def isccp4termination_phaser(self,text): """Test if text matches a 'phaser-style' CCP4 program termination. If the match fails then return False; if it succeeds then return a dictionary with the following keys: termination_name: the program name as reported in the CCP4 termination message at the tail of the program log. termination_message: the message text displayed in the CCP4 termination message, e.g. 'SUCCESS'. systemtime: the value of the 'CPU time' given at termination. Note that this is a subset of the attributes collected for standard CCP4 termination messages.""" # This style of banner looks like: # 12345678012345678012 # -------------------- # EXIT STATUS: SUCCESS # -------------------- # # CPU Time: 0 days 0 hrs 1 mins 34.43 secs (94.43 secs) # Finished: Wed May 17 09:29:25 2006 if "EXIT STATUS:" not in text: return dict() term = self.compile("isccp4termination_phaser",r"\-*\nEXIT STATUS: *([^\n]+)\n\-*\n\nCPU Time: *([A-Za-z0-9 \.\(\)]*)\nFinished: *([A-Za-z0-9 \.\(\)]*)").search(text) result = dict() if term: result["termination_text"] = term.group(0) result["termination_message"] = term.group(1) result["systemtime"] = term.group(2) result["nlines"] = term.group(0).count("\n") return result # Match old-style standard CCP4 program banner def isccp4banner_old(self,text): """Test if text matches an old-style CCP4 program banner. 'Old-style' banners come from versions of CCP4 predating version 4.1 of the suite. If the match fails then return False; if it succeeds then return a dictionary with the following keys: name: the name of the program from the CCP4 banner. version: the program version; for CCP4 programs, this is the version found in the program banner. For programs that don't explicitly give their own version number this will be the same as the CCP4 library version. date: the date string found in the CCP4 banner; it is typically the last date that the source code file was committed to CVS. (It is not the date that the program was run on - for that, use 'rundate' and 'runtime'). ccp4version: the CCP4 library version as it appears in the program banner. Typically this includes only the major and minor version numbers, but not the patch level. user: the user id that appears in the CCP4 banner at runtime. runtime: the time of day that the program run started at as reported in the program banner. rundate: the date that the program run started at as reported in the program banner.""" # # Banner looks like: # 123456789012345678901234567890123456789012345678901234567890 # 1########################################################## # ########################################################## # ########################################################## # ### CCP PROGRAM SUITE: dm VERSION 4.0: 26/11/98## # ########################################################## # User: pjx Run date: 3/16/00 Run time:14:12:40 if "### CCP" not in text: return dict() banner = self.compile("isccp4banner_old",r"1#{58,58}\n #{58,58}\n #{58,58}\n ### CCP PROGRAM SUITE: ([A-Za-z0-9_().]+) *VERSION ([0-9.]+) *: ([0-9 /]+)##\n #{58,58}\n User: ([^ ]+) *Run date: ([0-9 /]+) Run time:([0-9:]+) ?").search(text) result = dict() if banner: result["banner_text"] = banner.group(0) result["name"] = banner.group(1) result["ccp4version"] = banner.group(2) result["version"] = result["ccp4version"] result["date"] = banner.group(3) result["user"] = banner.group(4) result["rundate"] = banner.group(5) result["runtime"] = banner.group(6) result["nlines"] = banner.group(0).count("\n") return result # Match CCP4 keytext i.e. $TEXT ... def isccp4keytext(self,text): """Test if text matches CCP4 keytext message ($TEXT). If the match fails then return False; if it succeeds then return a dictionary with the following keys: name: the message 'name' or identifier junk_text: 'junk' text provided by the program (normally ignored) message: the message text nlines: the number of lines of text covered by the entire keytext message block.""" # # See e.g. http://www.ccp4.ac.uk/dist/html/loggraphformat.html # for format of TEXT information, but essentially it's: # # $TEXT :text name: $$ junk (ignored) text $$any text characters$$ # keytext = self.compile("isccp4keytext",r"\$TEXT[ \n]*:([^:]*):[ \n]*\$\$([^\$]*)\$\$([^\$]*)\$\$").search(text) result = dict() if keytext: result["name"] = keytext.group(1) result["junk_text"] = keytext.group(2) result["message"] = keytext.group(3) result["nlines"] = keytext.group(0).count("\n") return result # Match CCP4 TABLE def isccp4table(self,text): """Test if text matches CCP4 logfile table ($TABLE). If the match fails then return False; if it succeeds then return a dictionary with the following keys: rawtable: the exact text of the table as it appeared in the logfile title: the title of the table. type: the table type. graphs: the text of the $GRAPHS portion of the table text. columns: the text of the column headers in the table. text: the 'junk' text after the column headers and before the actual table data. data: the text of the table data (i.e. columns and rows of numbers or other data. nlines: the number of lines of text covered by the entire table block. These data items are themselves relatively unprocessed, so it is recommended that the text that matches the table should be fed into a 'table' object which provides a much easier to use interface to the various bits of data in the table. a table""" # # See e.g. http://www.ccp4.ac.uk/dist/html/loggraphformat.html # for format of TABLES # # Note that this regular expression accommodates slight deviations # by making the "closing" ":" of the $TABLE line optional. # This is done for consistency with loggraph's behaviour. # # Set up regular expression for entire table # This is the "strict" form of the table table = self.compile("isccp4table",r" *\$TABLE ?:([^:]*):?[ \n]+\$(GRAPHS|SCATTER)[^:]*(:[^\$]*)\$\$([^\$]*)\$\$([^\$]*)\$\$([^\$]*)\$\$").search(text) result = dict(); if table: result["rawtable"] = table.group(0) result["title"] = table.group(1).strip() result["type"] = table.group(2).strip() result["graphs"] = table.group(3) result["columns"] = table.group(4) result["text"] = table.group(5) result["data"] = table.group(6) result["nlines"] = table.group(0).count("\n") return result # If there wasn't a match then try a simpler match # This relaxes some of the rules in the format definintion table = self.compile("isccp4simplertable",r" *\$TABLE ?:([^\n]*)\n+\$(GRAPHS|SCATTER)[^:]*(:[^\$]*)\$\$([^\$]*)\$\$([^\$]*)\$\$([^\$]*)\$\$").search(text) if table: result["rawtable"] = table.group(0) result["title"] = table.group(1).strip() result["type"] = table.group(2).strip() result["graphs"] = table.group(3) result["columns"] = table.group(4) result["text"] = table.group(5) result["data"] = table.group(6) result["nlines"] = table.group(0).count("\n") return result return result # Match CCP4i header information def isccp4iheader(self,text): """Test if text matches a CCP4i header line.""" # # CCP4i header lines look like: # #CCP4I VERSION CCP4Interface 1.4.1 # #CCP4I SCRIPT LOG refmac5 # #CCP4I DATE 25 Oct 2005 15:19:22 # if self.isccp4itail(text): # Reject tail elements return "" header = self.compile("isccp4iheader",r"#CCP4I (.*)").search(text) result = "" if header: result = header.group(0) return result # Match CCP4i header information def isccp4itail(self,text): """Test if text matches a CCP4i tail line.""" # # CCP4i tail lines look like: # #CCP4I TERMINATION STATUS 1 # #CCP4I TERMINATION TIME 25 Oct 2005 15:19:30 # #CCP4I TERMINATION OUTPUT_FILES /home/pjx/PROJECTS/myProject/... # #CCP4I MESSAGE Task completed successfully # tail = self.compile("isccp4itail",r"#CCP4I (TERMINATION|MESSAGE) (.*)").search(text) result = "" if tail: result = tail.group(0) return result # Match CCP4i information text def isccp4i_information(self,text): """Test if text matches a CCP4i information block.""" # # CCP4i information lines look like: # 123456789012345678901234567890123456789012345678901234567890123456789012345 # *************************************************************************** # * Information from CCP4Interface script # *************************************************************************** # Running SHELXC to prepare data for heavy atom search # *************************************************************************** # info = self.compile("isccp4iinformation",r"\*{75,75}\n\* Information from CCP4Interface script\n\*{75,75}\n(.*)\n\*{75,75}").search(text) result = dict() if info: result["message"] = info.group(1) result["nlines"] = info.group(0).count("\n") return result # Match SHELX banners # 123456789012345678901234567890123456789012345678901234567890123456789012 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # + SHELXC - Create input files for SHELXD and SHELXE - Version 2006/3 + # + Copyright (C) George M. Sheldrick 2003-6 + # + SHELX_56_shelxc Started at 14:30:07 on 21 Apr 2006 + # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # + SHELXD-2006/3 - MACROMOLECULAR DIRECT METHODS - FORTRAN-95 VERSION + # + Copyright (C) George M. Sheldrick 2000-2006 + # + SHELX_56_shelxd_fa started at 14:30:11 on 21 Apr 2006 + # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # 12345678901234567890123456789012345678901234567890123456789012345678 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # + SHELXE - PHASING AND DENSITY MODIFICATION - Version 2006/3 + # + Copyright (C) George M. Sheldrick 2001-6 + # + Started at 14:30:36 on 21 Apr 2006 + # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # def isshelxbanner(self,text): """Test if text matches a SHELX program banner. This function tries to match the banners from SHELXC, SHELXD and SHELXE. If the match fails then return False; if it succeeds then return a dictionary with the following keys: name: the program name. version: the program version. runtime: the time of day that the program run started at as reported in the program banner. rundate: the date that the program run started at as reported in the program banner.""" # Set up regular expression for partial SHELX banner if "SHELX" not in text: return dict() banner = self.compile("isshelxbanner",r"\+{68,72}\n \+ (SHELXC|SHELXD|SHELXE)([^\+]*)\+\n \+ Copyright \(C\) *George M. Sheldrick[^\n]*\n \+ ([^\+]*)\+\n \+{68,72}").search(text) result = dict() if banner: result["banner_text"] = banner.group(0) result["name"] = banner.group(1) result["nlines"] = banner.group(0).count("\n") # Try and untangle the versions name = banner.group(1) version = False if name == "SHELXC" or name == "SHELXE": version = re.compile(r"Version ([^ \+]*)").search(banner.group(2)) elif name == "SHELXD": version = re.compile(r"\-([^ \-]*)").search(banner.group(2)) if version: result["version"] = version.group(1) else: result["version"] = "?" # Try and untangle the start times date = re.compile(r"(S|s)tarted at ([0-9:]+) on ([0-9A-Za-z ]+)").search( \ banner.group(3)) if date: result["runtime"] = str(date.group(2)) result["rundate"] = str(date.group(3)) else: result["runtime"] = "?" result["rundate"] = "?" return result # Match SHELX program termination def isshelxtermination(self,text): """Test if text matches a SHELX program termination. This function tries to match the messages from SHELXC, SHELXD and SHELXE. If the match fails then return False; if it succeeds then return a dictionary with the following keys: termination_name: the program name as reported in the SHELX termination message at the tail of the program log. termination_message: the message text displayed in the termination message. The content of this text varies between SHELXC and SHELXD/E so no further processing is currently attempted.""" # # Termination looks like: # 123456789012345678901234567890123456789012345678901234567890123456789012 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # + SHELXC for SHELX_56_shelxc finished at 14:30:11 on 21 Apr 2006 + # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # + SHELXD finished at 14:30:36 Total time: 23.43 secs + # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # + SHELXE finished at 14:34:01 Total time: 198.15 secs + # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # Set up regular expression for partial SHELX termination if "SHELX" not in text: return dict() term = self.compile("isshelxtermination",r"\+{68,72}\n \+ (SHELXC|SHELXD|SHELXE)([^\+]*)\+\n \+{68,72}").search(text) result = dict() if term: result["termination_text"] = term.group(0) result["termination_name"] = term.group(1) result["termination_message"] = term.group(2) result["nlines"] = term.group(0).count("\n") return result # Match program keyword input line ("Data line") def isdataline(self,line): """Test if line matches a CCP4 keyword input line. This function tries to match the keyword input lines. If the match fails then return False; if it succeeds then returns a dictionary with the following keys: data_line: the keyword data""" # # Keyworded lines look like: # # Data line--- make check NONE # # Set up regular expression for keyword input lines data = self.compile("isdataline",r"^ Data line--- ([^\n]*)\n").search(line) result = dict() if data: result["data_line_text"] = data.group(0) result["data_line"] = data.group(1) result["nlines"] = data.group(0).count("\n") return result # Match CCP4 file opening line (logical name/filename) def isfileopen(self,line): """Test if line matches a CCP4 file opening report. This function tries to match the reports of a file opening event from the CCP4 libraries, which report the logical name and associated filename. If the match fails then return False; if it succeeds then returns a dictionary with the following keys: logical_name: the logical name filename: the associated filename""" # # File opening report lines look like: # # Logical Name: /home/pjx/PROJECTS/myProject/aucn.mtz Filename: /home/pjx/PROJECTS/myProject/aucn.mtz # # Set up regular expression for file opening report lines fileopen = self.compile("isfileopen",r"^ Logical Name: ([^\n]*) Filename: ([^\n]*)\n").search(line) result = dict() if fileopen: result["fileopen_text"] = fileopen.group(0) result["logical_name"] = fileopen.group(1).strip() result["filename"] = fileopen.group(2).strip() return result # Match CCP4 SUMMARY_BEGIN line (summary start) def issummary_begin(self,line): """Test if line matches a CCP4 SUMMARY_BEGIN line. This function tries to match lines that indicate the start of a CCP4 summary block i.e. lines containing the text ''. If the match fails then return False; if it succeeds then return True.""" # # Summary start lines look like: # # # Set up regular expression for SUMMARY_BEGIN lines summary = self.compile("issummary_begin",r"<\!--SUMMARY_BEGIN-->").search(line) if summary: return True return False # Match CCP4 SUMMARY_END line (summary end) def issummary_end(self,line): """Test if line matches a CCP4 SUMMARY_END line. This function tries to match lines that indicate the end of a CCP4 summary block i.e. lines containing the text ''. If the match fails then return False; if it succeeds then return True.""" # # Summary start lines look like: # # # Set up regular expression for SUMMARY_BEGIN lines summary = self.compile("issummary_end",r"<\!--SUMMARY_END-->").search(line) if summary: return True return False ####################################################################### # External Functions ####################################################################### # parselog # # Given the name of a logfile, populates and returns a # logfile object def parselog(filen,progress=0): """Process a file and return a populated logfile object. parselog takes a file name as input; optionally if the progress argument is set to a positive integer then the function also reports its progress when it reaches a multiple of that number of lines. parselog works by reading the source file one line at a time from beginning to end. Each line is added to two buffers: a 'small' buffer, which stores the last 10 lines read, and a 'large' tablebuffer, which can store the last 1000 lines. After a line has been added, the small buffer is checked against a series of regular expressions designed to match the various features (banners, terminations and so on). If a match is found then parselog updates the logfile object that it is constructing and then clears the buffer. The tablebuffer is also checked at each line, to see if it contains a whole CCP4 logfile table. (The tablebuffer is a specialised form of buffer which is intended to optimise dealing with tables). The buffer sizes for the small and large buffers affect the speed of operation of parselog - if they are large then the parsing is slower because larger chunks of text are being tested multiple times. However if the buffers are too small to accommodate some of the logfile features then parselog is unable to detect those features. As some logfiles can contain extremely large tables, the tablebuffer must also be large. However other features are generally quite small. The other factor that can affect the speed of parsing is the make-up of the logfile. Counterintuitively, long files that contain few recognisable features can take longer because the buffers are only infrequently flushed.""" # Process a file and return a populated logfile object # # Maximum size of text buffer to use bufsize = 500 # Initial size of chunks to process chunksize = 500 # Regular expression object regex = patternmatch() # Buffer objects buff = buffer(bufsize) tablebuff = tablebuffer() linecount = 0 # New (empty) logfile object log = logfile(filen) prog = False summary = None # Open the file for reading f = open(filen,"r") # Read line-by-line for line in f: linecount += 1 # Progress indicator (if requested) # Report reaching "progress" number of lines if progress: if not linecount % progress: print "Processed "+str(linecount)+" lines" # Append line to buffers buff.append(line) tablebuff.append(line) # Get a chunk of text to process bufftext = buff.tail(chunksize) # Test the line for matches # # Data line i.e. CCP4 program keywords result = regex.isdataline(line) if result: if not prog or not prog.isprogram(): # Found a data line outside the context # of a program # Assume that we are now inside a program prog = log.addprogram() # Set the start line to be immediately # after the previous fragment try: previous_fragment = log.fragment(log.nfragments()-2) start = previous_fragment.get_endline() + 1 except IndexError: # Failed to get end line of previous # fragment start = 0 log.set_fragment_start(start) # Remove any html tags and store data_line = strip_logfile_html(result["data_line"]) prog.addkeyword(data_line) # File opening report line i.e. logical name/filename pairs result = regex.isfileopen(line) if result: if not prog or not prog.isprogram(): # Found a file opening report outside the context # of a program # Assume that we are now inside a program prog = log.addprogram() # Set the start line to be immediately # after the previous fragment try: previous_fragment = log.fragment(log.nfragments()-2) start = previous_fragment.get_endline() + 1 except IndexError: # Failed to get end line of previous # fragment start = 0 log.set_fragment_start(start) # Store the logical name/filename pair prog.addlogicalname(result["logical_name"],result["filename"]) # Start of a summary block i.e. result = regex.issummary_begin(line) if result: summary = log.addsummary(linecount) # End of a summary block i.e. result = regex.issummary_end(line) if result: if not summary: # Make a new summary with no start summary = log.addsummary() # Close out the current summary summary.set_end(linecount) # Test the buffer for matches # # CCP4 program banner result = regex.isccp4banner(bufftext) if result: ##print "Found CCP4 program banner" ##print "Result = "+str(result) prog = log.addprogram() prog.set_isccp4(True) prog.set_attributes_from_dictionary(result) log.set_fragment_start(linecount) buff.clear() tablebuff.clear() continue # SHELX program banner result = regex.isshelxbanner(bufftext) if result: ##print "Found SHELX program banner" ##print "Result = "+str(result) prog = log.addprogram() prog.set_attributes_from_dictionary(result) log.set_fragment_start(linecount) buff.clear() tablebuff.clear() continue # CCP4 program termination result = regex.isccp4termination(bufftext) if result: ##print "Found CCP4 program termination" ##print "Result = "+str(result) if not prog: # Outside the context of any fragment, and # found the end of a program before its start log.set_fragment_end(offsetline(linecount,result)) prog = log.addprogram() elif not prog.isprogram(): # Within the context of a fragment which # is not a program and found the end of a # program before its start log.set_fragment_end(offsetline(linecount,result)) prog = log.addprogram() prog.set_attributes_from_dictionary(result) log.set_fragment_end(linecount) prog.set_termination(True) # Clear the current pointer prog = False buff.clear() tablebuff.clear() continue # SHELX program termination result = regex.isshelxtermination(bufftext) if result: ##print "Found SHELX program termination" ##print "Result = "+str(result) if not prog: # Found the end of a program before its start prog = log.addprogram() prog.set_attributes_from_dictionary(result) log.set_fragment_end(linecount) prog.set_termination(True) # Clear the current pointer prog = False buff.clear() tablebuff.clear() continue # CCP4 table if tablebuff.complete(): if not prog: # Found a table outside the context of a program ##print "Adding table as a fragment" prog = log.newfragment() log.set_fragment_start(linecount) table_error = False table = prog.addtable(tablebuff.all()) if not table: print "*** Failed to extract table data ***" table_error = True elif table.parse_error(): print "*** Failed to parse table data ***" table_error = True if table_error: print "\tLogfile: "+str(log.filename()) print "\tTable start: L"+str(linecount - len(tablebuff) + 1) print "\tTable end : L"+str(linecount) # Add the table to the log, regardless of status log.addtable(table) # clear the buffers buff.clear() tablebuff.clear() continue #('$$' in line) and line.startswith() below are short-circuiting # calls to regex.is*(). # This optimization speeds the script ~2x in typical cases) # CCP4 keytext message result = regex.isccp4keytext(bufftext) if ('$$' in line) else None if result: ##print "Found CCP4 keytext" ##print "Result = "+str(result) if not prog: # Found a message outside the context of a program ##print "Adding keytext as a fragment" prog = log.newfragment() log.set_fragment_start(linecount) keytext = prog.addkeytext(result["name"], \ result["junk_text"], \ result["message"]) log.addkeytext(keytext) buff.clear() tablebuff.clear() continue # CCP4i header result = regex.isccp4iheader(bufftext) if line.startswith("#CCP4I") else None if result: ##print "Found CCP4i header" ##print "Result = "+str(result) log.append_ccp4i_header(result) buff.clear() continue # CCP4i tail result = regex.isccp4itail(bufftext) if line.startswith("#CCP4I") else None if result: ##print "Found CCP4i tail" ##print "Result = "+str(result) log.append_ccp4i_tail(result) buff.clear() tablebuff.clear() continue # CCP4i information result = regex.isccp4i_information(bufftext) if line.startswith("***") else None if result: ##print "Found CCP4i information" ##print "Result = "+str(result) # Make a new fragment - these messages shouldn't # appear inside the context of another program prog = log.addccp4i_info() prog.set_attributes_from_dictionary(result) log.set_fragment_start(linecount) log.set_fragment_end(linecount) # Clear the current context prog = False buff.clear() tablebuff.clear() continue # Ensure that the endline of the last fragment # is assigned log.set_fragment_end(linecount) # Close the file f.close() return log # # summarise # # Produce a summary of the data in a logfile object # def summarise(thislog): """Summarise the content of a logfile object. This function takes a logfile object as input and writes a a summary of the contents (fragments, programs, tables, messages and so on) to stdout.""" # Logfile name print "Summary for "+thislog.filename()+"\n" # Was it from CCP4i? if thislog.isccp4i(): print "This is a CCP4i logfile\n" # Number of programs or pseudo-programs print str(thislog.nfragments())+" logfile fragments\n" print "Fragments:" for i in range(0,thislog.nfragments()): fragment = thislog.fragment(i) if fragment.isprogram(): if fragment.has_attribute("name"): print "\tProgram: "+str(fragment.name) else: print "\tProgram: " else: if fragment.isccp4i_info(): print "\tCCP4i info" elif fragment.isfragment(): print "\tFragment" if fragment.ntables(): print "\t\t"+str(fragment.ntables())+" tables" if fragment.nkeytexts(): print "\t\t"+str(fragment.nkeytexts())+" keytexts" print "" # Summarise program logfile fragments if thislog.nprograms() > 0: print str(thislog.nprograms())+" program logfiles\n" print "Programs:" for i in range(0,thislog.nprograms()): prog = thislog.program(i) # Is it a CCP4 program? if prog.isccp4(): # Print name, version (and CCP4 version) print "\t"+prog.name+ \ "\tv"+prog.version+ \ "\t(CCP4 "+prog.ccp4version+")" else: # Print name and version if prog.has_attribute("name") and prog.has_attribute("version"): print "\t"+prog.name+"\t"+prog.version else: print "\t" if prog.termination(): print "\tTerminated with: "+prog.termination_message else: print "\tNo termination message found" # Keytexts if prog.nkeytexts(): print "\n\t\tKeytext messages:" for j in range(0,prog.nkeytexts()): print "\t\t"+str(prog.keytext(j).name())+ \ ": \""+str(prog.keytext(j).message())+"\"" # Tables if prog.ntables(): print "\n\t\tTables:" for table in prog.tables(): print "\t\tTable: \""+table.title()+"\"" print "" else: print "No program logfiles found" print "" # Total set of CCP4i information messages in the file print "CCP4i messages in file:" if thislog.nccp4i_info(): for i in range(0,thislog.nccp4i_info()): print "\tCCP4i info: \""+thislog.ccp4i_info(i).message+"\"" else: print "\tNo messages found" print "" # Total set of tables in the file print "Tables in file:" if thislog.ntables(): for table in thislog.tables(): print "\tTable: \""+table.title()+"\" ("+str(table.nrows())+" rows)" else: print "\tNo tables found" print "" # Total set of keytexts in the file print "Keytext messages in file:" if thislog.nkeytexts(): for i in range(0,thislog.nkeytexts()): print "\t"+str(thislog.keytext(i).name())+ \ ": \""+thislog.keytext(i).message()+"\"" else: print "\tNo keytext messages found" print "" ####################################################################### # Utility Functions ####################################################################### def copyfragment(fragment0,newobj): """Copy the data in a fragment to another object. The data in the source fragment 'fragment0' is copied to the target object 'newobj', and 'newobj' is returned. 'newobj' should be a fragment object or some subclass of fragment (such as a 'program' object). copyfragment can be used to 'mutate' a fragment into (for example) a program object.""" # Copy attribute data for item in fragment0.attributes(): newobj[item] = fragment0.get_attribute(item) # Copy tables for tbl in fragment0.tables(): newobj.addtable(tbl) # Copy keytexts for i in range(0,fragment0.nkeytexts()): keytext = fragment0.keytext(i) newobj.addkeytext(keytext.name(), keytext.junk_text(), keytext.message()) # Try to copy other attributes that fragment subclasses # have (such as keywords) try: for line in fragment0.keywords(): newobj.addkeyword(line) except AttributeError: # Either the source or target doesn't support # keyword storage pass # Return the populated object return newobj def offsetline(linen,pattern_result): """Return the line number offset by the size of a matched pattern. This is an internal utility function. Given 'linen' (the current line number) and 'pattern_result' (a dictionary containing data items returned from one of the regular expression functions), this function returns a line number which is offset to the start of the regular expression. It does this by locating a dictionary key 'nlines', which gives the size of the regular expression match.""" if pattern_result.has_key("nlines"): nlines = pattern_result["nlines"] else: nlines = 0 new_linen = linen - nlines - 1 if new_linen < 0: return 0 else: return new_linen def find_table_by_title(table_list,title_pattern,index=0): """Fetch a table object from a list by matching the title. This method is deprecated; use find_tables_by_title instead. This method looks up a particular table in a list of table objects (argument 'table_list'), by finding the first table in the list which matches the supplied regular expression 'title_pattern'. If there is more than one matching table then the 'index' argument specifies which of the list of matching tables should be returned. If index is out of range (or there are no matching tables) then return 'None'.""" rtable_list = find_tables_by_title(table_list,title_pattern) try: return rtable_list[index] except: return None def find_tables_by_title(table_list,title_pattern): """Return a list of tables by matching the title. This method returns a list of table objects containing all the tables in the supplied list 'table_list' for which the regular expression 'title_pattern' matches the table title. If no pattern is given then a list with all the table objects will be returned. A list is always returned, so in cases where there are no matches an empty list is returned, and if there is just one match then a list with a single item is returned.""" pattern=title_pattern.replace("(", "\\(").replace(")", "\\)") title_pattern=pattern rtitle = re.compile(title_pattern) rtable_list=[] for table in table_list: if rtitle.match(table.title()): rtable_list.append(table) return rtable_list def escape_xml_characters(data): """Return copy of string with XML special characters escaped. This replaces the characters <, > and & with the XML escape sequences <, > and &. It also replaces double quotes with ". This could be replaced in future by the 'xml.sax.saxutils.escape' function.""" return str(data).replace("&","&").replace("<","<").replace(">",">").replace('"',""") def strip_logfile_html(text): """Strip out HTML tags from logfile text. Return copy of input 'text' with HTML tags removed and any HTML special characters escaped. Note that this is specialised for CCP4 logfiles, in particular CCP4-formatted logfiles will be extracted from tags.""" out_text = "" buff = "" start_tag = "" end_tag = "" context = "none" for i in range(len(text)): c = text[i] #print "c = "+str(c)+" context = "+str(context) if c == "<": if context == "none": # Possible start of a tag, depending on # next character context = "putative_tag" buff = c else: # Everything up to this needs to # be dumped directly to output out_text = out_text + escape_xml_characters(buff) elif context == "putative_tag": buff = buff + c if c.isalpha(): context = "start_tag" elif c == "/": context = "end_tag" elif c == "!": context = "comment_tag" else: # Not a tag so dump it context = "none" out_text = out_text + escape_xml_characters(buff) elif context == "start_tag" \ or context == "end_tag" \ or context == "comment_tag": buff = buff + c if c == ">": if context == "start_tag": # End of a start tag # Process it and see if we can # salvage something salvage_text = salvage_tag_data(buff) if salvage_text != "": out_text = out_text + escape_xml_characters(salvage_text) # Reset the buffer context = "none" buff = "" elif context == "end_tag": # End of an end tag # Throw this away (for now) context = "none" buff = "" elif context == "comment_tag": # End of a comment # Throw this away (for now) context = "none" buff = "" else: # Nothing special about this # Add to the output out_text = out_text + escape_xml_characters(c) # Finished - append the remaining buffer out_text = out_text + escape_xml_characters(buff) return remove_blank_lines(out_text) def remove_blank_lines(text): """Remove duplicated blank lines from text. This function tries to remove extra blank lines from the supplied text, so that multiple blank lines are collapsed to just a single line.""" out_text = "" blank = True for line in text.splitlines(True): if line.isspace(): if not blank: blank = True out_text = out_text + line else: blank = False out_text = out_text + line return out_text def process_start_tag(tag_text): """Process an arbitrary HTML start tag. Given the text of an arbitrary tag, this function returns a tuple consisting of two elements. The first element is the tag name, the second is a dictionary with keys corresponding to attributes found in the tag and the values of those keys corresponding to the attribute values.""" tokens = tokenise(tag_text.strip("<>")) tag = tokens[0] attributes = {} if len(tokens) > 1: for token in tokens[1:]: try: i = token.index("=") key = token[0:i] value = token[i+1:].strip(" \"") except ValueError: key = token value = "" attributes[key] = value return (tag,attributes) def salvage_tag_data(tag_text): """Extract data from a HTML tag. This function deals with extracting the useful data from certain HTML tags found in CCP4 logfiles. Currently it is set up to extract CCP4 table data from the 'param' tags of JLogGraph applets. If no data could be salvaged then an empty string is returned.""" data = process_start_tag(tag_text) tag = data[0] attributes = data[1] # Jloggraph applet data if tag == "param" and attributes.has_key("name"): if attributes["name"] == "table" and attributes.has_key("value"): return attributes["value"] # Spacegroup if tag_is_spacegroup(tag_text): return tag_text # Return an empty string by default return "" def tag_is_spacegroup(text): """Check if a HTML tag looks like a spacegroup name. This does a very crude test to see whether the supplied string looks like a spacegroup name (rather than a random HTML tag).""" spacegroup = re.compile(r"$") result = spacegroup.search(text) if result: return True else: return False def retrieve(filen,start,end): """Retrieve a block of text from a file. Given the name of a file 'filen' and a pair of start and end line numbers, extract and return the text from the file. This uses the linecache module - beware of problems with consuming too much memory if the cache isn't cleared.""" text = "" # Check for consistency and validity of lines if start < 0 and end < 0 or end < start: return "" # Fetch from a file if possible if os.path.isfile(filen): try: for i in range(start,end+1): text = text+str(linecache.getline(filen,i)) return text except: print "Exception raised in retrieve method:" print "\tSource file = \""+str(filen)+"\"" print "\tStart line = "+str(start) print "\tEnd line = "+str(end) print "\tCurrent line = "+str(i) raise # Otherwise return nothing return "" def tokenise(line): """Tokenise a string and return a list. Split a line of text into tokens separated by whitespace, but ignoring whitespace that appears within quotes. This attempts to do a similar to job the CCP4 'parser' (which is itself actually a tokeniser) in the core CCP4 libraries. The hard part is dealing with quoted strings which form a single token, and which can themselves also contain quotes.""" sline = str(line) tokens = [] token = False quote = False tquote = "" start = 0 for i in range(len(sline)): c = sline[i] if token and not quote: if c == " " or c == "\t" or c == "\n": # end of current token tokens.append(sline[start:i]) token = False quote = False if token and ( c == '"' or c == "'" ): # Detected a quote - flip the quote flag if quote: if c == tquote: quote = False else: quote = True tquote = c if not token: if c != " " and c != "\t" and c != "\n": # Start of a new token token = True start = i if c == '"' or c == "'": # Also it's quoted quote = True tquote = c # End of the loop if token: # End of the last token tokens.append(sline[start:len(sline)]) return tokens ############################################################## # Diagnostic methods used for testing and as examples ############################################################## # List the TABLE tags in a file def table_tags(filen): """Report occurances of '$TABLE' tags in a log file. This function is principally a diagnostic tool and is independent of the other classes and methods in this module. It takes the name of a log file as input, scans the file for occurances of the $TABLE tag, and reports this to stdout.""" print "Scanning file "+str(filen) rtable = re.compile(r"\$TABLE *:") f = open(filen,"r") linecount = 0 tablecount = 0 tablelist = [] for line in f: linecount = linecount + 1 table = rtable.search(line) if table: tablecount = tablecount + 1 print str(linecount)+": "+str(line.rstrip("\n")) tablelist.append(line.rstrip("\n")) f.close() print str(linecount)+" lines and "+str(tablecount)+" tables" return tablelist # An example of making a new table from scratch def table_example(): """Demonstration function that creates and populates a table object. This function is for demonstration purposes only; it shows the basics of how to make and output a table. It creates a new table object, names it, populates some columns of data and then adds some graph definitions before outputting the formatted table to stdout.""" print "\nExample making a new table from scratch:\n" # Make a new (empty) table object tbl = table("A table with random data") # Add three columns called "x", "x^2" and "1/x" tbl.addcolumn("x") tbl.addcolumn("x^2") tbl.addcolumn("1/x") # Add some rows of data for i in range(0,10): row = dict() row["x"] = i row["x^2"] = i*i if i != 0: row["1/x"] = 1.0/float(i) else: row["1/x"] = "?" tbl.add_data(row) # Define some graphs tbl.definegraph("Y = X(squared)",("x","x^2")) tbl.definegraph("Y = 1/X",("x","1/x")) tbl.definegraph("All data",("x","x^2","1/x")) # Print out the data as a simple "table" and in loggraph markup print tbl.show() print tbl.loggraph() if __name__ == "__main__": """Usage example and demonstration for smartie. Run the main program using: python smartie.py file1 [file2 [...] ] For each file this example will generate a logfile object and then use the logfile 'summarise' method to print out a summary of the file's contents.""" print "Running test on logparser code" # Get the command line args print "command line: "+str(sys.argv) if len(sys.argv) == 1: print "Usage: smartie.py file1 [file2 [...] ]" sys.exit(0) # Cycle over files and process for filen in sys.argv[1:]: print "**** Parsing file \""+filen+"\"" start_time = time.clock() log = parselog(filen) end_time = time.clock() # Use the summarise function summarise(log) print "\nTime: "+str(end_time-start_time)+"\n"