M
Martin Hansen
Hello all,
The below code is too slow for practical use. I need it to run at least
20 times faster. Perhaps that is possible with some C code? I have no
experience with writing Ruby extensions. What are the pitfalls? Which
part of the code should be ported? Any pointers to get me started?
Cheers,
Martin
#!/usr/bin/env ruby
# IUPAC nucleotide pair ambiguity equivalents are saved in an
# array of bit fields.
BIT_A = 1 << 0
BIT_T = 1 << 1
BIT_C = 1 << 2
BIT_G = 1 << 3
EQUAL = Array.new(256, 0)
EQUAL['A'.ord] = BIT_A
EQUAL['T'.ord] = BIT_T
EQUAL['U'.ord] = BIT_T
EQUAL['C'.ord] = BIT_C
EQUAL['G'.ord] = BIT_G
EQUAL['M'.ord] = (BIT_A|BIT_C)
EQUAL['R'.ord] = (BIT_A|BIT_G)
EQUAL['W'.ord] = (BIT_A|BIT_T)
EQUAL['S'.ord] = (BIT_C|BIT_G)
EQUAL['Y'.ord] = (BIT_C|BIT_T)
EQUAL['K'.ord] = (BIT_G|BIT_T)
EQUAL['B'.ord] = (BIT_C|BIT_G|BIT_T)
EQUAL['D'.ord] = (BIT_A|BIT_G|BIT_T)
EQUAL['H'.ord] = (BIT_A|BIT_C|BIT_T)
EQUAL['V'.ord] = (BIT_A|BIT_C|BIT_G)
EQUAL['N'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)
# Module containing code to locate nucleotide patterns in sequences
allowing for
# ambiguity codes and a given maximum edit distance.
# Insertions are nucleotides found in the pattern but not in the
sequence.
# Deletions are nucleotides found in the sequence but not in the
pattern.
#
# Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):
# http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf
module PatternMatcher
#
------------------------------------------------------------------------------
# str.match(pattern[, pos[, max_edit_distance]])
# -> Match or nil
#
#
------------------------------------------------------------------------------
# Method to locate the next pattern match starting from a given
position. A match
# is allowed to contain a given maximum edit distance. If a match is
located a
# Match object will be returned otherwise nil.
def match(pattern, pos = 0, max_edit_distance = 0)
@pattern = pattern
@pos = pos
@max_edit_distance = max_edit_distance
@vector = vector_init
while @pos < @seq.length
vector_update
return match_new if match_found?
@pos += 1
end
end
#
------------------------------------------------------------------------------
# str.scan(pattern[, pos[, max_edit_distance]])
# -> Array
# str.scan(pattern[, pos[, max_edit_distance]]) { |match|
# block
# }
# -> Match
#
#
------------------------------------------------------------------------------
# Method to iterate through a sequence to locate pattern matches
starting
# from a given position and allowing for a maximum edit distance.
# Matches found in block context return the Match object. Otherwise
matches are
# returned in an Array.
def scan(pattern, pos = 0, max_edit_distance = 0)
matches = []
while match = match(pattern, pos, max_edit_distance)
if block_given?
yield match
else
matches << match
end
pos = match.pos + 1
end
return matches unless block_given?
end
private
# Method to initailize the score vector and return this.
def vector_init
vector = []
(0 ... @pattern.length + 1).each do |i|
vector = Score.new(matches = 0, mismatches = 0, insertions = i)
end
vector
end
# Method to update the score vector.
def vector_update
new_vector = @vector.dup
(0 ... @pattern.length).each do |i|
if match?(@seq[@pos], @pattern)
new_vector[i + 1] = @vector.dup
new_vector[i + 1].matches += 1
else
mismatch = @vector.dup
insertion = new_vector.dup
deletion = @vector[i + 1].dup
if deletion?(mismatch, insertion, deletion)
deletion.deletions += 1
new_vector[i + 1] = deletion
elsif mismatch?(mismatch, insertion, deletion)
mismatch.mismatches += 1
new_vector[i + 1] = mismatch
elsif insertion?(mismatch, insertion, deletion)
insertion.insertions += 1
new_vector[i + 1] = insertion
end
end
end
@vector = new_vector
end
# Method to determine if a match occurred.
def match?(char1, char2)
(EQUAL[char1.upcase.ord] & EQUAL[char2.upcase.ord]) != 0
end
# Method to determine if a mismatch occured.
def mismatch?(mismatch, insertion, deletion)
if mismatch.edit_distance <= insertion.edit_distance and
mismatch.edit_distance <= deletion.edit_distance
true
end
end
# Method to determine if an insertion occured.
def insertion?(mismatch, insertion, deletion)
if insertion.edit_distance <= mismatch.edit_distance and
insertion.edit_distance <= deletion.edit_distance
true
end
end
# Method to determine if a deletion occured.
def deletion?(mismatch, insertion, deletion)
if deletion.edit_distance <= mismatch.edit_distance and
deletion.edit_distance <= insertion.edit_distance
true
end
end
# Method to print the score vector.
def vector_print
@vector.each do |s|
puts s
end
puts
end
# Method that returns a Match object initialized with
# information from the score vector.
def match_new
matches = @vector.last.matches
mismatches = @vector.last.mismatches
insertions = @vector.last.insertions
deletions = @vector.last.deletions
length = @pattern.length - insertions + deletions
pos = @pos - length + 1
match = @seq[pos ... pos + length]
Match.new(pos, match, matches, mismatches, insertions, deletions,
length)
end
# Method that determines if a match was found by analyzing the score
vector.
def match_found?
if @vector.last.edit_distance <= @max_edit_distance
true
end
end
# Class to instantiate Score objects that holds score information.
class Score
attr_accessor :matches, :mismatches, :insertions, :deletions
def initialize(matches = 0, mismatches = 0, insertions = 0,
deletions = 0)
@matches = matches
@mismatches = mismatches
@insertions = insertions
@deletions = deletions
end
# Method to calculate and return the edit distance.
def edit_distance
self.mismatches + self.insertions + self.deletions
end
private
def to_s
"(#{[self.matches, self.mismatches, self.insertions,
self.deletions].join(',')})"
end
end
# Class for creating Match objects which contain the description of a
# match between a nucleotide sequence and a pattern.
class Match
attr_reader os, :match, :matches, :mismatches, :insertions,
:deletions, :edit_distance, :length
def initialize(pos, match, matches, mismatches, insertions,
deletions, length)
@pos = pos
@match = match
@matches = matches
@mismatches = mismatches
@insertions = insertions
@deletions = deletions
@edit_distance = mismatches + insertions + deletions
@length = length
end
end
end
The below code is too slow for practical use. I need it to run at least
20 times faster. Perhaps that is possible with some C code? I have no
experience with writing Ruby extensions. What are the pitfalls? Which
part of the code should be ported? Any pointers to get me started?
Cheers,
Martin
#!/usr/bin/env ruby
# IUPAC nucleotide pair ambiguity equivalents are saved in an
# array of bit fields.
BIT_A = 1 << 0
BIT_T = 1 << 1
BIT_C = 1 << 2
BIT_G = 1 << 3
EQUAL = Array.new(256, 0)
EQUAL['A'.ord] = BIT_A
EQUAL['T'.ord] = BIT_T
EQUAL['U'.ord] = BIT_T
EQUAL['C'.ord] = BIT_C
EQUAL['G'.ord] = BIT_G
EQUAL['M'.ord] = (BIT_A|BIT_C)
EQUAL['R'.ord] = (BIT_A|BIT_G)
EQUAL['W'.ord] = (BIT_A|BIT_T)
EQUAL['S'.ord] = (BIT_C|BIT_G)
EQUAL['Y'.ord] = (BIT_C|BIT_T)
EQUAL['K'.ord] = (BIT_G|BIT_T)
EQUAL['B'.ord] = (BIT_C|BIT_G|BIT_T)
EQUAL['D'.ord] = (BIT_A|BIT_G|BIT_T)
EQUAL['H'.ord] = (BIT_A|BIT_C|BIT_T)
EQUAL['V'.ord] = (BIT_A|BIT_C|BIT_G)
EQUAL['N'.ord] = (BIT_A|BIT_C|BIT_G|BIT_T)
# Module containing code to locate nucleotide patterns in sequences
allowing for
# ambiguity codes and a given maximum edit distance.
# Insertions are nucleotides found in the pattern but not in the
sequence.
# Deletions are nucleotides found in the sequence but not in the
pattern.
#
# Inspired by the paper by Bruno Woltzenlogel Paleo (page 197):
# http://www.logic.at/people/bruno/Papers/2007-GATE-ESSLLI.pdf
module PatternMatcher
#
------------------------------------------------------------------------------
# str.match(pattern[, pos[, max_edit_distance]])
# -> Match or nil
#
#
------------------------------------------------------------------------------
# Method to locate the next pattern match starting from a given
position. A match
# is allowed to contain a given maximum edit distance. If a match is
located a
# Match object will be returned otherwise nil.
def match(pattern, pos = 0, max_edit_distance = 0)
@pattern = pattern
@pos = pos
@max_edit_distance = max_edit_distance
@vector = vector_init
while @pos < @seq.length
vector_update
return match_new if match_found?
@pos += 1
end
end
#
------------------------------------------------------------------------------
# str.scan(pattern[, pos[, max_edit_distance]])
# -> Array
# str.scan(pattern[, pos[, max_edit_distance]]) { |match|
# block
# }
# -> Match
#
#
------------------------------------------------------------------------------
# Method to iterate through a sequence to locate pattern matches
starting
# from a given position and allowing for a maximum edit distance.
# Matches found in block context return the Match object. Otherwise
matches are
# returned in an Array.
def scan(pattern, pos = 0, max_edit_distance = 0)
matches = []
while match = match(pattern, pos, max_edit_distance)
if block_given?
yield match
else
matches << match
end
pos = match.pos + 1
end
return matches unless block_given?
end
private
# Method to initailize the score vector and return this.
def vector_init
vector = []
(0 ... @pattern.length + 1).each do |i|
vector = Score.new(matches = 0, mismatches = 0, insertions = i)
end
vector
end
# Method to update the score vector.
def vector_update
new_vector = @vector.dup
(0 ... @pattern.length).each do |i|
if match?(@seq[@pos], @pattern)
new_vector[i + 1] = @vector.dup
new_vector[i + 1].matches += 1
else
mismatch = @vector.dup
insertion = new_vector.dup
deletion = @vector[i + 1].dup
if deletion?(mismatch, insertion, deletion)
deletion.deletions += 1
new_vector[i + 1] = deletion
elsif mismatch?(mismatch, insertion, deletion)
mismatch.mismatches += 1
new_vector[i + 1] = mismatch
elsif insertion?(mismatch, insertion, deletion)
insertion.insertions += 1
new_vector[i + 1] = insertion
end
end
end
@vector = new_vector
end
# Method to determine if a match occurred.
def match?(char1, char2)
(EQUAL[char1.upcase.ord] & EQUAL[char2.upcase.ord]) != 0
end
# Method to determine if a mismatch occured.
def mismatch?(mismatch, insertion, deletion)
if mismatch.edit_distance <= insertion.edit_distance and
mismatch.edit_distance <= deletion.edit_distance
true
end
end
# Method to determine if an insertion occured.
def insertion?(mismatch, insertion, deletion)
if insertion.edit_distance <= mismatch.edit_distance and
insertion.edit_distance <= deletion.edit_distance
true
end
end
# Method to determine if a deletion occured.
def deletion?(mismatch, insertion, deletion)
if deletion.edit_distance <= mismatch.edit_distance and
deletion.edit_distance <= insertion.edit_distance
true
end
end
# Method to print the score vector.
def vector_print
@vector.each do |s|
puts s
end
puts
end
# Method that returns a Match object initialized with
# information from the score vector.
def match_new
matches = @vector.last.matches
mismatches = @vector.last.mismatches
insertions = @vector.last.insertions
deletions = @vector.last.deletions
length = @pattern.length - insertions + deletions
pos = @pos - length + 1
match = @seq[pos ... pos + length]
Match.new(pos, match, matches, mismatches, insertions, deletions,
length)
end
# Method that determines if a match was found by analyzing the score
vector.
def match_found?
if @vector.last.edit_distance <= @max_edit_distance
true
end
end
# Class to instantiate Score objects that holds score information.
class Score
attr_accessor :matches, :mismatches, :insertions, :deletions
def initialize(matches = 0, mismatches = 0, insertions = 0,
deletions = 0)
@matches = matches
@mismatches = mismatches
@insertions = insertions
@deletions = deletions
end
# Method to calculate and return the edit distance.
def edit_distance
self.mismatches + self.insertions + self.deletions
end
private
def to_s
"(#{[self.matches, self.mismatches, self.insertions,
self.deletions].join(',')})"
end
end
# Class for creating Match objects which contain the description of a
# match between a nucleotide sequence and a pattern.
class Match
attr_reader os, :match, :matches, :mismatches, :insertions,
:deletions, :edit_distance, :length
def initialize(pos, match, matches, mismatches, insertions,
deletions, length)
@pos = pos
@match = match
@matches = matches
@mismatches = mismatches
@insertions = insertions
@deletions = deletions
@edit_distance = mismatches + insertions + deletions
@length = length
end
end
end