class Cassiopee::Crawler
Base class to index and search through a string
Constants
- FILE_SUFFIX_EXT
- FILE_SUFFIX_POS
- METHOD_DIRECT
- METHOD_SUFFIX
- SUFFIXLEN
Attributes
Ambiguity map (Hash)
Array of comment characters to skip lines in input sequence file
Suffix files name/path
Max number fo threads to use (not yet used)
Method for search FORCE or SUFFIX
-
SUFFIX loads all suffixes and search through them afterwards, interesting for multiple searches (suffixes are reused)
-
FORCE checks matches while crossing the suffixes. Does not keep parsed data for later search FORCE method does not yet support optimal filters
Use alphabet ambiguity (dna/rna) in search, automatically set with loadAmbiguityFile
Manage basic cache to store previous match
Use persistent suffix file ?
Public Class Methods
# File lib/cassiopee.rb, line 299 def initialize @useAmbiguity = false @ambiguous = nil @useCache = false @file_suffix = "crawler" @method = 0 @prev_min_position = 0 @prev_max_position = 0 @suffix = nil @suffixmd5 = nil @position = 0 @suffixes = Hash.new @matches = Array.new @curmatch = 0 @use_store = false @sequence = nil @comments = Array["#"] @cache = Cassiopee::CrawlerCache.new end
Public Instance Methods
Clear suffixes in memory If using #use_store, clear the store too
# File lib/cassiopee.rb, line 340 def clear @suffixes = Hash.new @matches.clear @pattern = nil @prev_max_position = 0 @prev_min_position = 0 @cache.clearCache() File.delete(@file_suffix+FILE_SUFFIX_POS) unless !File.exists?(@file_suffix+FILE_SUFFIX_POS) end
Extract un suffix from suffix file based on md5 match
# File lib/cassiopee.rb, line 586 def extractSuffix(start,len) sequence = '' begin file = File.new(@file_suffix+FILE_SUFFIX_EXT, "r") file.pos = start sequence = file.read(len) file.close rescue => err puts "Exception: #{err}" return nil end return sequence end
Filter the array of positions with defined position filter
# File lib/cassiopee.rb, line 563 def filter(posArray) $log.debug("filter the position with " << @min_position.to_s << " and " << @max_position.to_s) if(@min_position==0 && @max_position==0) return posArray end filteredArray = Array.new i = 0 posArray.each do |pos| if(i==0) # First elt of array is match length filteredArray << pos end if(i>0 && pos>=@min_position && pos<=@max_position) filteredArray << pos end i +=1 end return filteredArray end
# File lib/cassiopee.rb, line 333 def filterCost filterOptimal(1) end
# File lib/cassiopee.rb, line 329 def filterLength filterOptimal(0) end
Filter matches to be between min and max start position If not using #use_store, search speed is improved but existing indexes are cleared If max=0, then max is string length Must be called after index creation or load
# File lib/cassiopee.rb, line 435 def filter_position(min,max) if(!use_store) clear() end @prev_min_position = @min_position @prev_max_position = @max_position @min_position = min @max_position = max end
Index an input file Clear existing indexes
# File lib/cassiopee.rb, line 359 def indexFile(f) # Parse file, map letters to reduced alphabet # Later on, use binary map instead of ascii map # Take all suffix, order by length, link to position map on other file # Store md5 for easier compare? + 20 bytes per suffix @sequence = readSequence(f) clear() @min_position = 0 @max_position = 0 end
Index an input string Clear existing indexes
# File lib/cassiopee.rb, line 373 def indexString(s) @sequence = s File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data| data.puts(@sequence) end clear() @min_position = 0 @max_position = 0 end
Load ambiguity rules from a file File format should be:
-
A=B,C D=E,F …
# File lib/cassiopee.rb, line 390 def loadAmbiguityFile(f) if(!File.exists?(f)) $log.error("File "<< f << "does not exists") exit(1) end @ambiguous = Hash.new file = File.new(f, "r") while (line = file.gets) definition = line.downcase.chomp ambdef = definition.split('=') ambequal = ambdef[1].split(',') @ambiguous[ambdef[0]] = ambequal end @useAmbiguity = true $log.debug("loaded ambiguity rules: " << @ambiguous.inspect()) file.close end
Load sequence from a previous index command
# File lib/cassiopee.rb, line 411 def loadIndex seq = '' begin file = File.new(@file_suffix+FILE_SUFFIX_EXT, "r") while (line = file.gets) input = line.downcase.chomp seq << input end file.close rescue => err $log.error("Exception: #{err}") exit() end @sequence = seq clear() @min_position = 0 @max_position = 0 end
Iterates over matches
# File lib/cassiopee.rb, line 602 def next if(@curmatch<@matches.length) @curmatch = @curmatch + 1 return @matches[@curmatch-1] else @curmatch = 0 return nil end end
Search an approximate string
-
support insertion, deletion, substitution
-
If edit > 0, use Hamming
-
Else use Levenshtein
# File lib/cassiopee.rb, line 494 def searchApproximate(s,edit) if(edit==0 && !@useAmbiguity) return searchExact(s) end allowederrors = edit if(edit>=0) useHamming = true minmatchsize = s.length maxmatchsize = s.length updateCache(1,edit) @matches = @cache.loadCache() else useHamming = false edit = edit * (-1) minmatchsize = s.length - edit maxmatchsize = s.length + edit updateCache(2,edit) @matches = @cache.loadCache() end if(@matches.length>0) return @matches end s = s.downcase #@matches.clear @pattern = Digest::MD5.hexdigest(s) parseSuffixes(@sequence,minmatchsize,maxmatchsize,allowederrors,s) return cache?(@matches) unless(method == METHOD_SUFFIX) @suffixes.each do |md5val,posArray| if(md5val == SUFFIXLEN) next end if (md5val == @pattern) filteredPosArray = filter(posArray) match = Array[md5val, 0, filteredPosArray] $log.debug "Match: " << match.inspect @matches << match else if(posArray[0]>= minmatchsize && posArray[0] <= maxmatchsize) # Get string seq = extractSuffix(posArray[1],posArray[0]) errors = isApproximateEqual?(seq,s,useHamming,edit) if(errors>=0) filteredPosArray = filter(posArray) match = Array[md5val, errors, filteredPosArray] $log.debug "Match: " << match.inspect @matches << match end end end end return cache?(@matches) end
Search exact match
# File lib/cassiopee.rb, line 447 def searchExact(s) if(@useAmbiguity) return searchApproximate(s,0) end s = s.downcase updateCache(0,0) @matches = @cache.loadCache() if(@matches.length>0) return cache?(@matches) end #@matches.clear @pattern = Digest::MD5.hexdigest(s) parseSuffixes(@sequence,s.length,s.length,0,s) return @matches unless(method == METHOD_SUFFIX) # Search required length, compare (compare md5?) # MD5 = 128 bits, easier to compare for large strings matchsize = @pattern.length @suffixes.each do |md5val,posArray| if (isMatchEqual?(md5val)) match = Array[md5val, 0, posArray] $log.debug "Match: " << match.inspect @matches << match end end return cache?(@matches) end
Set Logger level
# File lib/cassiopee.rb, line 352 def setLogLevel(level) $log.level = level end
# File lib/cassiopee.rb, line 612 def to_pos positions = Hash.new @matches.each do |match| # match = Array[md5val, errors, posArray] i=0 len = 0 match[2].each do |pos| if(i==0) len = pos else if(positions.has_key?(pos)) posmatch = positions[pos] posmatch << Array[len,match[1]] else posmatch = Array.new posmatch << Array[len,match[1]] positions[pos] = posmatch end end i += 1 end end return positions.sort end
# File lib/cassiopee.rb, line 640 def to_s puts '{ matches: "' << @matches.length << '" }' end
Private Instance Methods
If cache is used, store results for later retrieval, else return matches directly
# File lib/cassiopee.rb, line 647 def cache?(results) if(@useCache) @cache.saveCache(results) end return results end
check if string is approximatly equal to pattern s: string to compare pattern: base pattern used useHamming: use Hamming or edit distance edit : allowed errors
# File lib/cassiopee.rb, line 679 def isApproximateEqual?(s,pattern,useHamming,edit) errors = -1 s.extend(Cassiopee) if(useHamming) if(@useAmbiguity && @ambiguous!=nil) errors = s.computeHammingAmbiguous(pattern,edit,@ambiguous) else errors = s.computeHamming(pattern,edit) end else if(@useAmbiguity && @ambiguous!=nil) errors = s.computeLevenshteinAmbiguous(pattern,edit,@ambigous) else errors = s.computeLevenshtein(pattern,edit) end end end
check if md5 is equal to pattern
# File lib/cassiopee.rb, line 667 def isMatchEqual?(s) if(@pattern == s) return true end return false end
Parse input string
-
creates a suffix file
-
creates a suffix position file
# File lib/cassiopee.rb, line 705 def parseSuffixes(s,minlen,maxlen,edit=0,pat=nil) # Controls if(minlen<=0) minlen = 1 end if(maxlen>@sequence.length) maxlen = @sequence.length end if(!use_store) minpos = @min_position if(@max_position==0) maxpos = @sequence.length else maxpos = @max_position end else minpos = 0 maxpos = @sequence.length - minlen end suffixlen = nil $log.info('Start indexing') loaded = false # Hash in memory already contain suffixes for searched lengths if(@suffixes != nil && !@suffixes.empty?) suffixlen = @suffixes[SUFFIXLEN] if(suffixlen!=nil && !suffixlen.empty?) loaded = true (maxlen).downto(minlen) do |len| if(suffixlen.index(len)==nil) loaded = false break end end end end if(@use_store && loaded) $log.debug('already in memory, skip file loading') end # If not already in memory if(@use_store && !loaded) @suffixes = loadSuffixes(@file_suffix+FILE_SUFFIX_POS) suffixlen = @suffixes[SUFFIXLEN] end nbSuffix = 0 changed = false # Load suffix between maxlen and minlen (maxlen).downto(minlen) do |i| $log.debug('parse for length ' << i.to_s) if(suffixlen!=nil && suffixlen.index(i)!=nil) $log.debug('length '<<i " next end changed = true prev_progress = -1 (minpos..(maxpos)).each do |j| # if position+length longer than sequence length, skip it if(j+i>@sequence.length) next end @suffix = s[j,i] @suffixmd5 = Digest::MD5.hexdigest(@suffix) @position = j progress = (@position * 100).div(@sequence.length) if((progress % 10) == 0 && progress > prev_progress) prev_progress = progress $log.debug("progress: " << progress.to_s) end if(method==METHOD_DIRECT) if(edit==0 && !@useAmbiguity) if(isMatchEqual?(@suffixmd5)) errors = 0 else errors = -1 end else if(edit>=0) useHamming = true allowederrors = edit else useHamming = false allowederrors = edit * (-1) end errors = isApproximateEqual?(@suffix,pat,useHamming,allowederrors) end if(errors>=0) match = Array[@suffixmd5, errors, Array[i,j]] $log.debug "Match: " << match.inspect @matches << match end else nbSuffix += addSuffix(@suffixmd5, @position,i) end end $log.debug("Nb suffix found: " << nbSuffix.to_s << ' for length ' << i.to_s) unless method==METHOD_DIRECT end if(@use_store && changed) $log.info("Store suffixes") marshal_dump = Marshal.dump(@suffixes) sfxpos = File.new(@file_suffix+FILE_SUFFIX_POS,'w') sfxpos = Zlib::GzipWriter.new(sfxpos) sfxpos.write marshal_dump sfxpos.close end $log.info('End of indexing') end # Add a suffix in Hashmap def addSuffix(md5val,position,len) if(@suffixes.has_key?(md5val)) # Add position @suffixes[md5val] << position else # Add position, write new suffix # First elt is size of elt @suffixes[md5val] = Array[len, position] if(@suffixes.has_key?(SUFFIXLEN)) @suffixes[SUFFIXLEN] << len else @suffixes[SUFFIXLEN] = Array[len] end end return 1 end # read input string, and concat content def readSequence(s) $log.debug('read input sequence') counter = 1 sequence = '' begin file = File.new(s, "r") File.delete(@file_suffix+FILE_SUFFIX_EXT) unless !File.exists?(@file_suffix+FILE_SUFFIX_EXT) File.open(@file_suffix+FILE_SUFFIX_EXT, 'w') do |data| while (line = file.gets) counter = counter + 1 input = line.downcase.chomp skip = false comments.each do |c| if(input[0] == c[0]) # Line start with a comment char, skip it $log.debug("skip line") skip = true break end end if(!skip) sequence << input data.puts input end end end file.close rescue => err puts "Exception: #{err}" err end $log.debug('data file created') return sequence end # Load suffix position file in memory def loadSuffixes(file_name) return Hash.new unless File.exists?(@file_suffix+FILE_SUFFIX_POS) begin file = Zlib::GzipReader.open(file_name) rescue Zlib::GzipFile::Error file = File.open(file_name, 'r') ensure obj = Marshal.load file.read file.close return obj end end # Filter @matches to keep only the longest or the error less matches for a same start position def filterOptimal(type) positions = Hash.new @matches.each do |match| # match = Array[md5val, errors, posArray] i=0 len = 0 match[2].each do |pos| if(i==0) len = pos else if(positions.has_key?(pos)) posmatch = positions[pos] posmatch << Array[len,match[1],match[0]] #positions[pos] << posmatch else posmatch = Array.new posmatch << Array[len,match[1],match[0]] positions[pos] = posmatch end end i += 1 end end matchtoremove = Array.new positions.each do |pos,posmatch| optimal = nil match = nil count = 0 newoptimal = nil newmatch = nil (0..posmatch.length-1).each do |i| solution = posmatch[i] if(i==0) if(type==0) # length optimal = solution[0] else # cost optimal = solution[1] end match = solution[2].to_s #count += 1 next end newmatch = solution[2].to_s if(type==0) # length newoptimal = solution[0] if(newoptimal.to_i>optimal.to_i) optimal = newoptimal matchtoremove << match match = newmatch else matchtoremove << newmatch end else # cost newoptimal = solution[1] if(newoptimal<optimal) optimal = newoptimal matchtoremove << match match = newmatch else matchtoremove << newmatch end end count += 1 end end newmatches = Array.new @matches.each do |match| found = false matchtoremove.each do |item| if(match[0]==item) found = true break end end if(!found) newmatches << match end end @matches = newmatches end end end ")
Update cache object with current object parameters
-
method: 0 -> exact, 1 -> hamming, 2 -> edit
# File lib/cassiopee.rb, line 657 def updateCache(method,errors) @cache.file_suffix = @file_suffix @cache.min_position = @min_position @cache.max_position = @max_position @cache.method = method @cache.errors = errors end