Map month abbreviations to their two-digit number equivalent.
By default we will consider the current year.
NOTE: The <tt>:id</tt> pattern should contain a capture group for the licensee ID.
This is what we start with – all zero.
# File lib/log2counter/parser.rb, line 67
#
# Reads +csv_file+ through FasterCSV (first row is treated as headers) and
# rebuilds the nested statistics hash from it.
#
# Each data row is expected to carry, in this order: month, licensee, name,
# address, sessions, searches, downloads. The three counters are converted
# with +to_i+.
#
# Returns a hash of the form
# <tt>stats[month][licensee][name][address] = { :sessions => n, :searches => n, :downloads => n }</tt>.
def load(csv_file)
  stats = {}

  FasterCSV.new(csv_file, :headers => true).each do |row|
    month, licensee, name, address, sessions, searches, downloads = row.fields

    # Walk down the nesting one level at a time, creating levels on demand.
    by_licensee = (stats[month] ||= {})
    by_name     = (by_licensee[licensee] ||= {})
    by_address  = (by_name[name] ||= {})

    by_address[address] = {
      :sessions  => sessions.to_i,
      :searches  => searches.to_i,
      :downloads => downloads.to_i
    }
  end

  stats
end
# File lib/log2counter/parser.rb, line 85
#
# Sets up a parser for +log_file+.
#
# +licensees+:: hash of licensee => settings; only entries whose settings
#               have a truthy <tt>:export</tt> value are kept.
# +months+::    list of months in <tt>YYYY_MM</tt> format; falls back to
#               DEFAULT_MONTHS when +nil+.
# +regexp+::    optional hash of patterns merged over DEFAULT_REGEXP.
#
# Raises ArgumentError if any month is not of the form <tt>YYYY_MM</tt>.
def initialize(log_file, licensees, months = nil, regexp = nil)
  @log_file = log_file
  @months   = months || DEFAULT_MONTHS

  malformed = @months.any? { |month| month !~ /\A\d\d\d\d_\d\d\z/ }
  raise ArgumentError, "illegal format for month; must be YYYY_MM" if malformed

  overrides = regexp || {}
  @regexp   = DEFAULT_REGEXP.merge(overrides)

  # A single pattern matching any of the configured ones.
  @constraint = Regexp.union(*@regexp.values)

  # Drop every licensee that is not flagged for export.
  @licensees = licensees.reject { |_, settings| !settings[:export] }

  initialize_licensees
end
Now here's the method you want to call. Returns a hash:
stats = { '2007_06' => { 'Somewhere, Inst.' => { 'Name' => { '12.34.56.78' => { :sessions => 12, :searches => 34, :downloads => 56 }, ... }, ... }, ... }, ... }
# File lib/log2counter/parser.rb, line 116
#
# Parses +log_file+ and counts sessions, searches and downloads per month,
# licensee, licensee name and client address.
#
# Returns a nested hash,
# <tt>stats[month][licensee][name][address]</tt>, whose leaves are hashes
# with the keys <tt>:sessions</tt>, <tt>:searches</tt> and
# <tt>:downloads</tt>. Every month in +months+ and every licensee in
# +licensees+ is present in the result; where no address with at least one
# session remains, a single placeholder entry keyed by +nil+ is inserted.
def parse
  # Cache resolved host names; fall back to the raw value if it can't be
  # resolved.
  addr2addr = Hash.new { |hash, addr|
    hash[addr] = begin
      Resolv.getaddress(addr)
    rescue Resolv::ResolvError
      addr
    end
  }

  # Cache licensees.
  addr2lcee = Hash.new { |hash, addr|
    hash[addr] = licensees_by_ip.get(addr)
  }

  # Our result hash
  stats = {}

  # Create a new LogParser and send our log file. Yields a hash per line.
  LogParser.new(:minimal, constraint).parse_io_stream(log_file) { |stat|
    path = stat['PATH_INFO']

    # Skip lines that don't have any useful information for us anyway.
    next unless path =~ constraint

    # Maybe we already captured the licensee ID? Must be read right after
    # the match above, before any other regexp operation overwrites it.
    id = $1

    # Extract month and year
    m, y = stat['DATETIME'][/\/(.*?):/, 1].split('/')
    month = [y, ABBR2MONTH[m]].join('_')  # Target format is 'YYYY_MM'

    # Skip lines that fall out of the range we're interested in.
    next unless months.include?(month)

    address = addr2addr[stat['REMOTE_ADDR']]

    # Prefer a match by address; otherwise fall back to the licensee ID
    # found in the path.
    # NOTE(review): URI.decode is obsolete and no longer available on
    # Ruby >= 3.0 -- confirm the targeted Ruby version before upgrading.
    licensee = addr2lcee[address] || licensees_by_id[
      URI.decode(id || path[regexp[:id], 1] || '')
    ]

    # Couldn't find a matching licensee? Skip it!
    next unless licensee

    name     = licensee[:name]
    licensee = licensee[:licensee]

    (((stats[month] ||= {})[licensee] ||= {})[name] ||= {})[address] ||= DEFAULT_STATS.dup
    _address = stats[month][licensee][name][address]

    # Increment our counts, since that's what we're here for...
    _address[:sessions]  += 1 if path =~ regexp[:login]
    _address[:searches]  += 1 if path =~ regexp[:search]
    _address[:downloads] += 1 if path =~ regexp[:download]
  }

  # Now we need to fill in any months and licensees we didn't come across
  # before.
  months.each { |month|
    stats[month] ||= {}

    licensees.each { |licensee, hash|
      stats[month][licensee] ||= {}
      addresses = stats[month][licensee][hash[:name]]

      if addresses
        # Drop entries with zero sessions -- how come they occur, anyway?
        addresses.delete_if { |_, counts| counts[:sessions].zero? }
      end

      # Add a default "empty" entry for completeness' sake. Use a copy so
      # that placeholders never alias each other or the DEFAULT_STATS
      # constant itself.
      if addresses.nil? || addresses.empty?
        stats[month][licensee][hash[:name]] = { nil => DEFAULT_STATS.dup }
      end
    }
  }

  # That's it, return what we've got.
  stats
end
Create additional mappings for our licensees.
# File lib/log2counter/parser.rb, line 197
#
# Builds the lookup tables <tt>@licensees_by_ip</tt> and
# <tt>@licensees_by_id</tt> from +licensees+. Both map a key (IP prefix or
# ID) to a hash with the keys <tt>:licensee</tt> and <tt>:name</tt>.
#
# The <tt>:ip</tt> and <tt>:id</tt> settings of a licensee may be an array,
# a single value, or missing altogether.
def initialize_licensees
  @licensees_by_ip = {}
  @licensees_by_id = {}

  licensees.each { |licensee, hash|
    _hash = { :licensee => licensee, :name => hash[:name] }

    # Array() leaves arrays untouched, wraps a single value and turns a
    # missing (nil) setting into an empty list instead of raising.
    Array(hash[:ip]).each { |ip| licensees_by_ip[ip] = _hash }
    Array(hash[:id]).each { |id| licensees_by_id[id] = _hash }
  }

  # Convenience method to get a licensee from an address. Note that
  # +licensees_by_ip+ usually has subnets instead of full IPs, hence the
  # prefix comparison. Returns +nil+ if no key is a prefix of +ip+.
  def licensees_by_ip.get(ip)
    find(lambda { [] }) { |key, _| ip[0, key.length] == key }.last
  end
end