#!/usr/bin/ruby require 'rubygems' require 'sqlite3' class Uniq def initialize @db = @itab = @otab = nil @opts = { :dbfile => 'wmo9.sqlite', :itab => 'volc1raw', :otab => 'volc1', :sync => 200000, :where => '', :ttaaii => nil, :cccc => nil, :limits => '' } end def []= key, val @opts[key] = val end def open for t in [@opts[:itab], @opts[:otab]] raise "bad table name #{t}" unless /^[.\w]+$/ =~ t end @db = SQLite3::Database.new(@opts[:dbfile]) @itab = @opts[:itab] @otab = @opts[:otab] end def close @db.close if @db end def mkdb sql = <<-ENDSQL DROP TABLE IF EXISTS #{@otab}; CREATE TABLE #{@otab}( region TEXT NOT NULL, rth TEXT NOT NULL, country TEXT NOT NULL, centre TEXT NOT NULL, date DOUBLE, category TEXT, ttaaii TEXT NOT NULL, cccc TEXT NOT NULL, codeform TEXT, timegroup TEXT, content TEXT, remarks TEXT, CONSTRAINT ahl UNIQUE (ttaaii, cccc) ); CREATE INDEX #{@otab}ahl ON #{@otab} (ttaaii, cccc); ENDSQL @db.execute_batch(sql) end def simplecopy ttaaii, cccc sql = <<-ENDSQL INSERT INTO #{@otab} SELECT * FROM #{@itab} WHERE ttaaii = ? AND cccc = ? ENDSQL @db.execute(sql, ttaaii, cccc) end def mergehack ttaaii, cccc sql = <<-ENDSQL SELECT * FROM #{@itab} WHERE ttaaii = ? AND cccc = ? ORDER BY oid ASC ENDSQL cols = nil recs = [] @db.execute2(sql, ttaaii, cccc) { |row| if cols.nil? then cols = row else rec = Hash[*cols.zip(row).flatten] recs.push rec end } warned = false 1.upto(recs.size - 1) { |i| rec = recs[i] for col in cols - %w(timegroup content date) next if rec[col] == recs.first[col] puts <<-EOF d #{ttaaii} #{cccc}\t#{i}.#{col}<#{rec[col]}>\t 0.#{col}<#{recs.first[col]}> EOF warned = true end } therec = recs.first.dup if not warned then if recs.all?{ |r| /^\s*([:\d]+(\s*,\s*[:\d]+)*)?\s*$/ =~ r['timegroup'] and /^\s*(XXX|NIL|\d{5}(\s+\d{5})*|[A-Z]{4}(\s+[A-Z]{4})*)\s*$/ =~ r['content'] } then values = recs.map{|r| r['timegroup'].strip.split(/\s*,\s*/)}.flatten therec['timegroup'] = values.uniq.sort.join(',') values = recs.map{|r| r['content'].strip.split(/\s+/)}.flatten therec['content'] = values.uniq.sort.join(' ') therec['date'] = recs.map{|r| r['date']}.compact.max puts "mc #{ttaaii} #{cccc} #{recs.size}" if $defout.tty? elsif recs.all?{|r| /^\s*\d+(\s*,\s*[:\d]+)*\s*$/ =~ r['timegroup'] } and recs.map{|r| r['content']}.uniq.size == 1 then values = recs.map{|r| r['timegroup'].strip.split(/\s*,\s*/)}.flatten therec['timegroup'] = values.uniq.sort.join(',') therec['date'] = recs.map{|r| r['date']}.compact.max puts "mt #{ttaaii} #{cccc} #{recs.size}" if $defout.tty? else #printf("%d %s\n", recs.map{|r| r['content']}.uniq.size, # recs.map{|r| /^\s*\d+(\s*,\s*\d+)*\s*$/ =~ r['timegroup']}.inspect #) 0.upto(recs.size - 1) { |i| rec = recs[i] puts <<-EOF u #{ttaaii} #{cccc} #{i}: timegroup<#{rec['timegroup']}> content<#{rec['content']}> EOF } end end sql = <<-ENDSQL INSERT INTO #{@otab}( region, rth, country, centre, date, category, ttaaii, cccc, codeform, timegroup, content, remarks ) VALUES ( :region, :rth, :country, :centre, :date, :category, :ttaaii, :cccc, :codeform, :timegroup, :content, :remarks ) ENDSQL @db.execute(sql, therec) end def distill if @opts[:ttaaii] or @opts[:cccc] then whereclause = 'WHERE ' + [@opts[:ttaaii], @opts[:cccc]].compact.join(' AND ') else whereclause = '' end sql = <<-ENDSQL SELECT ttaaii, cccc, COUNT(oid) FROM #{@itab} #{whereclause} GROUP BY ttaaii, cccc #{@opts[:limit]}; ENDSQL count = 0 @db.transaction @db.execute(sql) {|row| ttaaii, cccc, n = row case n.to_i when 1 simplecopy(ttaaii, cccc) else mergehack(ttaaii, cccc) end if (count = count.succ) > @opts[:sync] then @db.commit $defout.flush @db.transaction count = 0 end } puts "commit" if $defout.tty? @db.commit @db.execute('VACUUM') if @opts[:vacuum] end def run open mkdb distill close end end app = Uniq.new for arg in ARGV case arg when /^-nv/, /^--novacuum/ then app[:vacuum] = false when /^-f/, /^--file=/ then app[:dbfile] = $' when /^-i/, /^--intable=/ then app[:itab] = $' when /^-o/, /^--outtable=/ then app[:otab] = $' when /^-c/, /^--cycle=/ then app[:sync] = Integer($') when /^-l/, /^--limits=/ then app[:limits] = Integer($') when /^[A-Z]{4}[0-9]{2}$/ then app[:ttaaii] = "ttaaii = '#{$&}'" when /^[_A-Z]{4}[_0-9]{2}$/ then app[:ttaaii] = "LIKE(ttaaii, '#{$&}')" when /^[A-Z]{4}$/ then app[:cccc] = "cccc = '#{$&}'" when /^[_A-Z]{4}$/ then app[:cccc] = "LIKE(cccc, '#{$&}')" else puts <<-EOF usage: #{$0} [-options ...] [TTAAii] [CCCC] -fDB --file=DB database file [wmo9.sqlite] -iTAB --intable=TAB input table [volc1raw] -oTAB --outtable=TAB output table [volc1] -nv --novacuum do not VACUUM at exit -cN --cycle=N commit transaction for each N [1000] headings -lN --limits=N limit conversion to N [unlimited] headings EOF exit 1 end end app.run