class PerseusMatch::TokenSet

Attributes

form[R]
tokens[R]

Public Class Methods

file?(form) click to toggle source
# File lib/perseus_match/token_set.rb, line 146
# Treats +form+ as a file name. An absolute +form+ is used as given, anything
# else is expanded with File.expand_path. Returns the resulting path when it
# names a readable regular file, and nil otherwise.
def file?(form)
  path = if Pathname.new(form).absolute?
    form
  else
    File.expand_path(form)
  end

  return unless File.file?(path)
  return unless File.readable?(path)

  path
end
new(form, tokens = nil) click to toggle source
Calls superclass method
# File lib/perseus_match/token_set.rb, line 191
# Builds a token set for +form+. When no +tokens+ are supplied they are
# obtained from the class-level +tokenize+. The tokens are handed to the
# superclass constructor, and a plain-array snapshot of them (+to_a+) is kept
# in @tokens alongside @form.
def initialize(form, tokens = nil)
  tokens ||= self.class.tokenize(form)
  super(tokens)

  @form, @tokens = form, to_a
end
tokenize(form, unknowns = false) click to toggle source
# File lib/perseus_match/token_set.rb, line 98
# Returns the token set for +form+, building a class-level form => token set
# table on the first call.
#
# The table is filled from the file named by ENV['PM_TOKENS_FILE'] (default
# 'perseus.tokens') when that file is readable; otherwise Lingo is run and
# its output is parsed. +unknowns+ is passed through to +parse+.
#
# Returns nil when Lingo was run on a file (see the end of the method).
# Raises RuntimeError when Lingo is needed but LINGO_FOUND is false.
#
# NOTE: +form+ is downcased in place, so the caller's string is modified.
def tokenize(form, unknowns = false)
  form.downcase!
  # `@tokens ||= nil` defines the ivar when it is still unset. A table left
  # by an earlier call is reused as-is, so +unknowns+ has no effect here.
  return @tokens[form] if @tokens ||= nil

  # Parsed token lists, keyed by form; filled by +parse+.
  @_tokens = Hash.new
  # On first access of a key, wraps the parsed list (or [] when there is
  # none) in an instance via +new+ and stores it.
  @tokens  = Hash.new { |h, k| h[k] = new(k, @_tokens[k] || []) }

  tokens_file = ENV['PM_TOKENS_FILE'] || 'perseus.tokens'

  if File.readable?(tokens_file)
    # Previously written results exist: parse them instead of running Lingo.
    File.open(tokens_file) { |f| parse(f, unknowns, @_tokens) }
    @tokens[form]
  else
    raise "Lingo installation not found at #{LINGO_BASE}" unless LINGO_FOUND

    # Dump LINGO_CONFIG as YAML into a temporary config file.
    # NOTE(review): +cfg+ is the value of the block, so this relies on
    # Tempfile.open returning the block's result and on YAML.dump returning
    # the IO it wrote to -- confirm for the Ruby version in use.
    cfg = Tempfile.open(['perseus_match_lingo', '.cfg']) { |t|
      YAML.dump(LINGO_CONFIG, t)
    }

    # Path of +form+ when it names a readable file, nil otherwise.
    file = file?(form)

    # PM_KEEP_TOKENS: "1", "y", "yes" or "true" means "write the raw Lingo
    # output to tokens_file"; any other value is itself used as the target path.
    if keep = ENV['PM_KEEP_TOKENS']
      keep = File.expand_path(keep =~ /\A(?:1|y(?:es)?|true)\z/ ? tokens_file : keep)
    end

    begin
      Dir.chdir(LINGO_BASE) {
        # NOTE(review): Process.ruby is not part of core Ruby; presumably an
        # extension that runs a Ruby subprocess and yields its pid, stdin,
        # stdout and stderr -- confirm against the library that defines it.
        Process.ruby(*%W[lingo.rb -c #{cfg.path}]) { |_, i, o, _|
          # Feed either the file's lines or the form itself to the subprocess.
          file ? File.foreach(file) { |line| i.puts line } : i.puts(form)

          i.close_write
          tokens = o.read

          File.open(keep, 'w') { |f| f.puts tokens } if keep
          parse(tokens, unknowns, @_tokens)
        }
      }
    ensure
      # Remove the temporary config file whether or not the run succeeded.
      cfg.unlink
    end

    # For a single form: return its token set and drop the table, so the next
    # call starts from scratch. For a file: keep the table for later lookups;
    # the `unless` then evaluates to nil, which becomes the return value.
    unless file
      tokens, @tokens = @tokens[form], nil
      tokens
    end
  end
end

Private Class Methods

parse(output, unknowns = false, tokens = {}) click to toggle source
# File lib/perseus_match/token_set.rb, line 153
# Scans +output+ line by line and records a token list per form in +tokens+
# (an existing entry for a form is never overwritten). Returns +tokens+.
#
# Two kinds of lines are recognised:
# * <form = [(analysis) (analysis+) ...]> -- one Token per parenthesised
#   analysis, with a trailing "+" inside the parentheses dropped;
# * <form> or :form: -- a single Token built from the downcased text.
#
# The form used as key has Token::WC_RE stripped once and is downcased.
#
# For the second kind of line, when +unknowns+ is truthy and the token
# reports +unk?+, the key is appended to +unknowns+ if it responds to <<;
# otherwise a warning is printed.
def parse(output, unknowns = false, tokens = {})
  output.each_line do |line|
    if line =~ /<(.*?)\s=\s\[(.*)\]>/
      key, analyses = $1, $2
      key.sub!(Token::WC_RE, '')
      key.downcase!

      tokens[key] ||= analyses.scan(/\((.*?)\+?\)/).flatten.map { |t| Token.new(t) }
    elsif line =~ /<(.*)>/ || line =~ /:(.*):/
      # The token is built from the raw (unstripped) match, before the key
      # is cleaned up.
      key, token = $1, Token.new($1.downcase)
      key.sub!(Token::WC_RE, '')
      key.downcase!

      report = unknowns && token.unk?
      if report && unknowns.respond_to?(:<<)
        unknowns << key
      elsif report
        warn "UNK: #{key} [#{line.strip}]"
      end

      tokens[key] ||= [token]
    end
  end

  tokens
end

Public Instance Methods

==(other) click to toggle source
# File lib/perseus_match/token_set.rb, line 231
# Two token sets are equal when their token lists are equal; +form+ is not
# taken into account (see +eql?+ for that).
#
# Objects that do not expose +tokens+ (nil, strings, plain arrays, ...) are
# simply unequal instead of raising NoMethodError.
def ==(other)
  other.respond_to?(:tokens) && tokens == other.tokens
end
disjoint?(other) click to toggle source
# File lib/perseus_match/token_set.rb, line 206
# True when the flattened forms of this set and of +other+ have no element
# in common.
def disjoint?(other)
  mine   = forms.flatten
  theirs = other.forms.flatten

  (mine & theirs).empty?
end
distance(other) click to toggle source
# File lib/perseus_match/token_set.rb, line 198
# Number of distinct forms found in exactly one of the two sets: the size of
# the union of both form lists minus the size of their intersection.
def distance(other)
  mine   = forms
  theirs = other.forms

  union  = mine | theirs
  common = mine & theirs

  union.size - common.size
end
eql?(other) click to toggle source
# File lib/perseus_match/token_set.rb, line 235
# Stricter than ==: the two sets must compare equal with == and must also
# have equal +form+.
def eql?(other)
  same_tokens = (self == other)
  same_tokens && form == other.form
end
excl(wcs) click to toggle source
# File lib/perseus_match/token_set.rb, line 218
# Returns a new set of the same class and form that contains only the tokens
# for which +match?(wcs)+ is false.
def excl(wcs)
  remaining = reject { |token| token.match?(wcs) }
  self.class.new(form, remaining)
end
forms() click to toggle source
# File lib/perseus_match/token_set.rb, line 202
# The +form+ of every token in this set, in order. The list is computed on
# the first call and the same array is returned afterwards.
def forms
  return @forms if @forms

  @forms = map { |entry| entry.form }
end
incl(wcs) click to toggle source
# File lib/perseus_match/token_set.rb, line 214
# Returns a new set of the same class and form that contains only the tokens
# for which +match?(wcs)+ is true.
def incl(wcs)
  matching = select { |token| token.match?(wcs) }
  self.class.new(form, matching)
end
inclexcl(inclexcl = {}) click to toggle source
# File lib/perseus_match/token_set.rb, line 210
# Applies +incl+ and then +excl+ in one step. The :incl entry of the options
# falls back to Token::ANY_WC when missing; the :excl entry is passed on
# unchanged (nil when missing).
def inclexcl(inclexcl = {})
  included = incl(inclexcl[:incl] || Token::ANY_WC)
  included.excl(inclexcl[:excl])
end
inspect() click to toggle source
# File lib/perseus_match/token_set.rb, line 239
# The superclass representation followed by the form in angle brackets.
def inspect
  base = super
  "#{base}<#{form}>"
end
Also aliased as: to_s
soundex() click to toggle source
# File lib/perseus_match/token_set.rb, line 222
# Returns (and memoizes) a set of the same class and form in which every
# token is replaced by a Token built from the Soundex code of its form; the
# token's +wc+ is carried over.
#
# Before encoding, diacritics are replaced and the first run of non-word
# characters is removed from the form. An empty string is used when
# Text::Soundex.soundex returns nil or false.
#
# Raises RuntimeError (via +ensure_soundex!+) when Text::Soundex is missing.
def soundex
  ensure_soundex!

  @soundex ||= begin
    encoded = map do |token|
      plain = token.form.replace_diacritics.sub(/\W+/, '')
      Token.new(Text::Soundex.soundex(plain) || '', token.wc)
    end

    self.class.new(form, encoded)
  end
end
to_s()
Alias for: inspect

Private Instance Methods

ensure_soundex!() click to toggle source
# File lib/perseus_match/token_set.rb, line 247
# Guard for Soundex-based features: raises RuntimeError, with the backtrace
# starting at the caller, unless Text::Soundex is defined. Returns nil
# otherwise.
def ensure_soundex!
  return if defined?(Text::Soundex)

  raise RuntimeError, "Soundex functionality not available", caller(1)
end