# File lib/perseus_match.rb, line 69
#
# Non-raising variant of check!: forwards all arguments to check! and
# returns whatever it returns. If check! raises CheckFailedError, +false+
# is returned instead. Any other exception propagates unchanged.
def check(*args)
  begin
    check!(*args)
  rescue CheckFailedError
    false
  end
end
# File lib/perseus_match.rb, line 75
#
# Builds a matcher for +phrase+ and +target+ (passing +pm_options+ to +new+),
# reads +attribute+ from it and compares that value against +threshold+
# using +operator+.
#
# phrase, target:: the two inputs handed to +new+
# threshold::      right-hand side of the comparison (default 0)
# operator::       comparison method sent to the value (default :>)
# pm_options::     options hash forwarded to +new+
# attribute::      reader sent to the matcher (default :similarity)
#
# Returns a struct with the members +pm+, +value+, +threshold+ and
# +operator+ when the comparison holds; raises CheckFailedError (built from
# the same four values) otherwise.
def check!(phrase, target, threshold = 0, operator = :>, pm_options = {}, attribute = :similarity)
  pm    = new(phrase, target, pm_options)
  value = pm.send(attribute)

  unless value.send(operator, threshold)
    raise CheckFailedError.new(pm, value, threshold, operator)
  end

  # Reuse a single result class instead of allocating a new anonymous
  # Struct class on every successful check.
  @check_result_class ||= Struct.new(:pm, :value, :threshold, :operator)
  @check_result_class.new(pm, value, threshold, operator)
end
# File lib/perseus_match.rb, line 65
#
# Creates a Cluster from +phrases+ (forwarding +pm_options+) and returns the
# result of calling +rank+ on it with +options+.
def cluster(phrases, options = {}, pm_options = {})
  clustered = Cluster.new(phrases, pm_options)
  clustered.rank(options)
end
# File lib/perseus_match.rb, line 57
#
# Convenience shortcut: instantiates a matcher with +args+ and returns its
# distance.
def distance(*args)
  matcher = new(*args)
  matcher.distance
end
# File lib/perseus_match.rb, line 61
#
# Wraps +phrases+ in a new List, forwarding +pm_options+, and returns it.
def match(phrases, pm_options = {})
  list = List.new(phrases, pm_options)
  list
end
# File lib/perseus_match.rb, line 98
#
# Sets up a matcher for +phrase+ and +target+. Both are converted with
# +to_s+ and passed through +sanitize+ before being stored.
#
# Recognised +options+ keys:
# :default_coeff:: falls back to DEFAULT_COEFF when missing or nil/false
# :distance_spec:: falls back to DISTANCE_SPEC when missing or nil/false
# :verbose::       stored as given
def initialize(phrase, target, options = {})
  @phrase, @target = sanitize(phrase.to_s), sanitize(target.to_s)

  @default_coeff = options[:default_coeff] || DEFAULT_COEFF
  @distance_spec = options[:distance_spec] || DISTANCE_SPEC
  @verbose       = options[:verbose]

  # Cache of similarity values, keyed by coefficient.
  @similarity = {}
end
# File lib/perseus_match.rb, line 86
#
# Tokenizes +form+. When TokenSet.file?(form) returns a truthy value, that
# value is handed to TokenSet.tokenize; otherwise +form+ itself goes to
# PhraseTokenSet.tokenize. +unknowns+ is forwarded in both cases.
def tokenize(form, unknowns = false)
  file = TokenSet.file?(form)
  return TokenSet.tokenize(file, unknowns) if file

  PhraseTokenSet.tokenize(form, unknowns)
end
# The distance is bounded: 0 <= distance <= Infinity.
# File lib/perseus_match.rb, line 119
#
# Distance between phrase and target. Computed by +calculate_distance+ on
# first access and cached in @distance afterwards.
def distance
  @distance ||= calculate_distance
end
# File lib/perseus_match.rb, line 110
#
# Tokens of the phrase, obtained through the class-level +tokenize+ on first
# access and cached in @phrase_tokens.
def phrase_tokens
  @phrase_tokens ||= begin
    tokenizer = self.class
    tokenizer.tokenize(phrase)
  end
end
# The similarity is bounded: 1 >= similarity >= 0.
# File lib/perseus_match.rb, line 124
#
# Similarity for the given coefficient. A nil (or false) +coeff+ is replaced
# by +default_coeff+. Results of +normalize_distance+ are cached per
# coefficient in @similarity.
def similarity(coeff = nil)
  # Callers may pass nil explicitly, so resolve the effective coefficient here.
  key = coeff || default_coeff
  @similarity[key] ||= normalize_distance(key)
end
# File lib/perseus_match.rb, line 114
#
# Tokens of the target, obtained through the class-level +tokenize+ on first
# access and cached in @target_tokens.
def target_tokens
  @target_tokens ||= begin
    tokenizer = self.class
    tokenizer.tokenize(target)
  end
end
# File lib/perseus_match.rb, line 135
#
# Computes the raw distance between the two token sets:
#
# * Infinity when the sets are disjoint,
# * 0 when they are +eql?+,
# * otherwise the sum over +distance_spec+ of each entry's token distance
#   multiplied by its weight.
def calculate_distance
  if phrase_tokens.disjoint?(target_tokens)
    Infinity
  elsif phrase_tokens.eql?(target_tokens)
    0
  else
    distance_spec.inject(0) do |sum, (options, weight)|
      sum + token_distance(options) * weight
    end
  end
end
# File lib/perseus_match.rb, line 170
#
# Converts the raw distance into a similarity score using +coeff+.
#
# The score is 1 / exp(distance / norm), where norm is derived from the
# combined token count, +coeff+, +total_weight+ and Math::E.
#
# Returns 0 when both token sets are empty, and 1.0 whenever the distance
# is zero.
def normalize_distance(coeff)
  length = phrase_tokens.size + target_tokens.size
  return 0 if length == 0

  # A zero distance is full similarity regardless of the normalisation
  # factor. Returning early also avoids 0 / 0.0 => NaN when the factor
  # collapses to zero (e.g. a coeff of 0 or a total weight of 0).
  return 1.0 if distance == 0

  norm = Math.log(length ** Math.sqrt(2)) * coeff * total_weight * Math::E

  1 / Math.exp(distance / norm)
end
# File lib/perseus_match.rb, line 131
#
# Cleans up +str+ in two steps and returns the result as a new string:
#
# 1. removes every "(...)" or "[...]" group together with the whitespace
#    directly in front of it;
# 2. cuts off everything from the first "/" or ":" (including whitespace
#    directly in front of it) to the end of that line.
def sanitize(str)
  without_groups = str.gsub(/\s*\(.*?\)|\s*\[.*?\]/, '')
  without_groups.sub(/\s*[\/:].*/, '')
end
# File lib/perseus_match.rb, line 144
#
# Distance between the phrase and target token sets after applying
# +options+:
#
# * both sets are first passed through +inclexcl+ with +options+,
# * then sorted when options[:sort] is set,
# * then converted with +soundex+ when options[:soundex] is set.
#
# When +verbose+ is truthy, the options, both prepared token sets and the
# resulting distance are written via +warn+. Returns the distance.
def token_distance(options = {})
  left  = phrase_tokens.inclexcl(options)
  right = target_tokens.inclexcl(options)

  left, right = left.sort, right.sort       if options[:sort]
  left, right = left.soundex, right.soundex if options[:soundex]

  result = left.distance(right)

  warn "#{options.inspect}: #{left.inspect} #{right.inspect} => #{result} " if verbose

  result
end
# File lib/perseus_match.rb, line 179
#
# Sum of all weights in +distance_spec+ as a Float (the fold starts at 0.0).
# Computed on first access and cached in @total_weight.
def total_weight
  @total_weight ||= begin
    weights = distance_spec.map { |_, weight| weight }
    weights.inject(0.0) { |total, weight| total + weight }
  end
end