~/projects/ruby/threadsplit-0.1a/threadsplit.rb.html

#!/usr/local/bin/ruby -w
###########################################################
# threadsplit.rb -- splits up TAG threads for processing  #
# Version:          0.1a (minor update)                   #
# Author:           Thomas Adam <thomas_adam16@yahoo.com> #
# Thanks to:        Heather Stern <star@starshine.org>    #
# Date:             Friday October 10 2003, 03:12am GMT   #
# Last Change:      Thursday 18th Dec 2003, 06:35am GMT   #
###########################################################

#require 'profile'

# set some variables...
basedir = File.expand_path("~/lgmail/bait")            # $(pwd)
sdir = "Raw"                                           # subdir
mb = "Mail98"                                          # name of inbox
mlprefix = Regexp.escape("[TAG]")                      # prefix for ML
pathname = File.expand_path("#{basedir}/#{sdir}/#{mb}")# mbox location

# Pattern for a thread-starting Subject line: the list prefix, whitespace,
# then a subject that does not begin with a "Re:" reply marker.
#   (?=\S)          pins \s+ to *all* the whitespace, so the engine cannot
#                   backtrack a space and slip past the next check.
#   (?![Rr][Ee]:)   rejects "Re:", "RE:", "re:" ... and nothing else.
# A pair of negated character classes ([^Rr][^Ee]) is not equivalent: it
# throws away every subject whose first letter is R/r or whose second
# letter is E/e, e.g. "Ruby question" or "Help needed".
tagmails = /^Subject: #{mlprefix}\s+(?=\S)(?![Rr][Ee]:)/  # regex object

vthread = []                                           # subject threads
mbox = []                                              # *the* mbox
total = 0                                              # output for debug

# generate a list of known threads...
# Scan the mailbox once and keep each distinct thread-starting Subject line
# (the whole line, trailing newline included).  The block form of File.open
# closes the mailbox as soon as the scan finishes instead of leaving the
# handle open, and the tagmails pattern defined with the settings is
# reused rather than repeated here.
File.open(pathname, "r") { |inbox| vthread.concat(inbox.grep(tagmails).uniq) }

# Read the mbox in question...
# mbox ends up as an array of messages; each message is itself an array whose
# first element is the "From " separator line (without its newline) and whose
# remaining elements are the raw header/body lines that followed it.
File.open(pathname, "r") do |inbox|        # block form: file is closed for us
  inbox.each do |line|
    envelope = line[/^From .*/]            # non-nil only on a separator line
    if envelope
      mbox << [envelope]                   # start of a new e-mail
    elsif !mbox.empty?
      mbox.last << line                    # header or body of current e-mail
    end
    # Anything that appears before the first "From " separator (a stray blank
    # line, for instance) has no message to belong to and is skipped; without
    # this guard mbox.last would be nil and the append would raise.
  end
end

#Originally, I had planned to match the whole line, but I am lazy.
#split(/^From \w+\@\w+([.]\w)+\s+([...] [...]\d+) (\d+:\d+:\d+) \d{4}/)

# controlling block
filenew = []                  # messages of the thread currently being written
listtag = / #{mlprefix} /     # built once; the list prefix never changes

# Take each thread-starting Subject line in turn.
vthread.each do |b|
  # Strip the leading "Subject:" (8 characters).  What is left -- list prefix,
  # subject text and trailing newline -- is the search key and, once cleaned
  # up below, the output filename.
  nm = b[8..-1]

  # Since the order of the mbox is known, the first message containing the
  # key is the originating post; anything after it is a reply.  A message
  # matches when any of its lines contains the key.
  filenew = mbox.select { |c| c.any? { |d| d.include?(nm) } }

  # Print something to the screen (debugging...)
  $stderr.puts("Processing: #{nm}")

  # Remove [TAG] and any meta-shell characters that even "ls -b" hates.
  # Each step uses the non-destructive method and reassigns: the bang forms
  # return nil when they change nothing, so chaining them
  # (gsub!(...).gsub!(...)) raises NoMethodError as soon as the first
  # substitution finds no match.
  nm = nm.squeeze(" ")                  # multiple spaces -> single space
  nm = nm.gsub(listtag, '')             # drop the list prefix
  nm = nm.gsub(/[\s?":.,*+\/]/, '_')    # whitespace/punctuation -> "_"
  nm = nm.squeeze('_').sub(/_$/, '')    # collapse runs, trim trailing "_"

  # The count includes the originating post itself.
  $stderr.puts("Replies: #{filenew.length}")
  total += filenew.length

  # Intelligence time. A thread of exactly one message has had no answer
  # (or is spam), so it goes to the "wanted" directory Q; every other thread
  # goes to T.
  subdir = filenew.length == 1 ? "Q" : "T"

  # Block form closes (and therefore flushes) each output file right away,
  # rather than keeping one descriptor open per thread until exit.
  File.open("#{basedir}/#{subdir}/#{nm}", "w") { |out| out.puts(filenew) }
end

$stderr.puts("\nTotal = #{total}")

# That's it, we've finished.