#! /usr/bin/perl -w

# srom: Sucks-Rules-O-Meter

# Copyright 1998 Electric Lichen L.L.C.
# Don Marti <dmarti@electriclichen.com>

# 29 October 2001 -- added (Mac) OS X

# 20 February 2001 -- switched to Raging Search

# revised 15 January 2000 -- added OpenBSD. 

# revised 9 July 2000 -- added logging functionality 
# (Johan Walles, d92-jwa @ nada.kth.se)

# revised 3 June 1999 -- new AltaVista result page format

# revised 19 Mar 1998 -- added $rule_offset

#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.

require 5.004;
# require "SimpleGet.pl";
use LWP::Simple;

# The search URL is $SEARCH_PREFIX . <phrase> . $SEARCH_SUFFIX.  The
# percent escapes wrap the phrase as +"<phrase>" (%2B is "+", %22 is a
# double quote).
my $SEARCH_PREFIX = 'http://altavista.com/web/results?q=%2B%22';
my $SEARCH_SUFFIX = '%22';

# The VOTES_THRESHOLD value determines how much the number of votes
# for a certain OS affects its weighted result.  An OS with exactly
# this many votes gets a weighted score that lies exactly halfway
# between its raw score and the average rating for all operating
# systems.  For more on this, see the discussion of the "true Bayesian
# estimate" below.
my $VOTES_THRESHOLD = 25;

# Display name of each operating system => the spellings searched for.
# The hits for all spellings of one OS are added together.
my %aliases = (
  'AmigaOS'  => [qw(AmigaOS)],
  'BeOS'     => [qw(BeOS)],
  'FreeBSD'  => [qw(FreeBSD)],
  'Linux'    => [qw(Linux)],
  'Mac OS'   => ['Mac OS', 'MacOS'],
  'Mac OS X' => ['OS X'],
  'MVS'      => [qw(MVS)],
  'NetBSD'   => [qw(NetBSD)],
  'NetWare'  => [qw(NetWare)],
  'OpenBSD'  => [qw(OpenBSD)],
  'OS/2'     => [qw(OS/2)],
  'OS/400'   => [qw(OS/400)],
  'Solaris'  => [qw(Solaris)],
  'Unix'     => [qw(Unix)],
  'VMS'      => [qw(VMS OpenVMS)],
  'Windows'  => [qw(Windows)],
);

# Quality => the words that count as a vote for that quality.
my %synonyms = (
  'sucks' => [qw(sucks)],
  'rules' => [qw(rules rocks)],
);

###########################################################################

# Query the search engine once for every "<alias>+<synonym>" phrase and
# add up the reported hit counts.
#
# The results are left in two package variables that the rest of the
# script reads:
#   %count    -- $count{$os}{'sucks'} and $count{$os}{'rules'} hold the
#                summed hit counts for each key of %aliases
#   $greatest -- the largest of those sums (it starts at 1 and is only
#                ever raised, so it is never zero)
#
# Progress is reported on STDERR.  The script dies if $greatest is still
# 1 once all queries have been made.

warn "Warning: You have not provided any log file directory on the command line.  No logs will be produced.\n"
  unless defined($ARGV[0]);

$greatest = 1;

foreach my $os (keys(%aliases)) {

  # Start every counter at zero so that it is defined even when no
  # result page could be parsed for this OS.
  $count{$os}{'sucks'} = 0;
  $count{$os}{'rules'} = 0;

  foreach my $alias (@{$aliases{$os}}) {

    foreach my $quality ('sucks', 'rules') {
      foreach my $synonym (@{$synonyms{$quality}}) {

        my $result = get($SEARCH_PREFIX . lc("$alias+$synonym") . $SEARCH_SUFFIX);

        # get() returns undef when the request fails.  Skip the query
        # instead of pattern matching against undef, and say which one
        # was lost.
        unless (defined $result) {
          warn "Warning: No response for $alias $synonym ($os $quality)\n";
          next;
        }

        if ($result =~ /found\s+([\d,]+)\D+results/i) {
          my $raw = $1;
          $raw =~ s/\D//g;        # drop the thousands separators
          $count{$os}{$quality} += $raw + 0;
          print STDERR "$alias $synonym ($os $quality): $raw\n";
        }
      }
      $greatest = $count{$os}{$quality} if $count{$os}{$quality} > $greatest;
    }
  }

  # Note: this blank line goes to STDOUT, unlike the progress lines
  # above, which go to STDERR.
  print "\n";
}

die "bad AltaVista, bad, bad " if $greatest == 1;

# Emit the raw numbers inside an HTML comment so that they are easy to
# convert: an OSCOUNT line with the number of operating systems, then
# one "OSRATING <os> <sucks> <rules>" line per OS in sorted order.
my @summary_lines = ("OSCOUNT " . scalar(keys(%aliases)) . "\n");

foreach my $os (sort(keys(%aliases))) {
  push @summary_lines,
    "OSRATING $os "
      . ($count{$os}{'sucks'} + 0) . " "
      . ($count{$os}{'rules'} + 0) . "\n";
}

print "<!--\n", @summary_lines, "-->";

# Render the chart as an HTML table with one row per operating system.
# Each row holds two nested one-row tables:
#   - "sucks": an empty spacer cell followed by a red bar, so the bar
#     grows leftwards from the middle of the page;
#   - "rules": a green bar followed by a white spacer cell, so the bar
#     grows rightwards.
# Bar widths are percentages of $greatest.  The hit count is written
# inside the bar when the bar is wide enough, otherwise in the spacer
# next to it.

print qq{<table width="100%" cellspacing="0">},
      qq{\n<tr><th>&nbsp;</th><th width="40%">sucks\n\t   </th>},
      qq{<th width="40%">rules, rocks\n\t   </th></tr>};

foreach my $os (sort(keys(%aliases))) {
  my ($sucks_n, $rules_n) = @{$count{$os}}{'sucks', 'rules'};

  my $suck_bar = int(100 * $sucks_n / $greatest);
  my $rule_bar = int(100 * $rules_n / $greatest);
  my $suck_gap = 100 - $suck_bar;
  my $rule_gap = 100 - $rule_bar;

  # Sucks side: the count goes into the spacer while the bar is narrow...
  my $suck_gap_text = ($suck_gap >= 75)
    ? qq{<font color="red">} . ($sucks_n + 0) . qq{</font>}
    : "&nbsp;";

  # ...and into the bar itself once it is wider than a quarter.  A
  # zero-width bar gets no content at all.
  my $suck_bar_text = ($suck_bar > 25) ? qq{<font color="white">$sucks_n</font>}
                    : ($suck_bar == 0) ? ""
                    :                    "&nbsp;";

  # Rules side: same idea, mirrored.
  my $rule_bar_text = ($rule_bar >= 25) ? qq{<font color="white">$rules_n</font>}
                    : ($rule_bar == 0)  ? ""
                    :                     "&nbsp;";

  my $rule_gap_text = ($rule_bar < 25)
    ? qq{<font color="green">$rules_n</font>}
    : "&nbsp;";

  print join('',
             qq{\n<tr><th align="right" width="20%">$os</th>},

             qq{<td width="40%">},
             qq{<table width="100%" cellspacing="0">},
             qq{<tr><th width="$suck_gap%" align="right">},
             $suck_gap_text,
             "</th>",
             qq{<th width="$suck_bar%" bgcolor="red">},
             $suck_bar_text,
             "</th></tr></table></td>",

             qq{<td width="40%">},
             qq{<table width="100%" cellspacing="0"><tr>},
             qq{<th width="$rule_bar%" bgcolor="green">},
             $rule_bar_text,
             "</th>",
             qq{<th bgcolor="white" align="left" width="$rule_gap%">},
             $rule_gap_text,
             "</th></tr></table></td></tr>");
}

print "</table>";

# Time stamp of this run, in GMT.
my $updated = gmtime(time());
print qq{<h3 align="right">Updated $updated GMT.</h3>};

# Log the retrieved data for later use by gnuplot.
#
# When a directory is given as the first command line argument, every
# operating system in %aliases gets one "<name>.gnuplot" file in it.
# The first line of such a file is the comment "# <OS name>"; every run
# appends one line "<date> <popularity>", at most one per date.
#
# All file handling is done from within Perl.  No shell commands are
# built from the directory name, so directory names containing spaces or
# shell metacharacters are not passed through a shell.  Files are opened
# with two-argument open() and an explicit mode prefix, which keeps the
# script usable on the Perl version named in the "require" line above.

# Return the OS name stored in the "# OS-name" comment on the first line
# of $logfile_name.  Dies if the file can't be opened or if its first
# line is missing or does not have that format.
sub log_file_os {
  my ($logfile_name) = @_;
  local *LOGFILE;

  open (LOGFILE, "< $logfile_name")
    or die "Error: Can't open $logfile_name for reading ($!)\n";

  my $os_comment = <LOGFILE>;

  close LOGFILE
    or warn "Warning: Couldn't close $logfile_name ($!)";

  # An empty file yields undef; treat that like a malformed first line
  # instead of tripping an "uninitialized value" warning in chomp.
  $os_comment = '' unless defined $os_comment;
  chomp $os_comment;

  my ($os) = ($os_comment =~ /^\# (.+)/)
    or die "Error: The first line of $logfile_name is not on '# OS-name' format";

  return $os;
}

# Return true if $logfile_name already contains a line that starts with
# "$datestring ".  A file that can't be read is reported with a warning
# and treated as having no such line.
sub log_has_entry_for {
  my ($logfile_name, $datestring) = @_;
  local *LOGFILE;

  unless (open (LOGFILE, "< $logfile_name")) {
    warn "Warning: Can't open $logfile_name for reading ($!)\n";
    return 0;
  }

  my $found = 0;
  while (defined(my $line = <LOGFILE>)) {
    if (index($line, "$datestring ") == 0) {
      $found = 1;
      last;
    }
  }

  close LOGFILE
    or warn "Warning: Couldn't close $logfile_name ($!)";

  return $found;
}

# Has a log file directory name been specified on the command line?
if (defined($ARGV[0])) {
  my $logdirectory = $ARGV[0];
  my %logfile;      # OS name => path of its log file

  # Make sure that the log file directory exists
  if (! -e $logdirectory) {
    mkdir($logdirectory, 0777)
      or die "Error: Unable to create log file directory ($!)";

    warn "Warning: New log file directory $logdirectory created.\n";
  }

  # Make sure that the log file directory is a directory
  die "Error: $logdirectory is not a directory!\n" unless (-d $logdirectory);

  # Find out what OSes already have log files.  Only plain files whose
  # name ends in ".gnuplot" and does not start with a dot are considered.
  opendir (LOGDIR, $logdirectory)
    or die "Error: Can't read log file directory $logdirectory ($!)\n";
  my @existing = grep { !/^\./ && /\.gnuplot$/ && -f "$logdirectory/$_" }
    readdir(LOGDIR);
  closedir (LOGDIR);
  @existing = sort @existing;

  foreach my $name (@existing) {
    my $logfile_name = "$logdirectory/$name";
    my $os = log_file_os($logfile_name);

    die "Error: $os has more than one log file (at least $logfile_name and $logfile{$os})\n"
      if (defined $logfile{$os});

    $logfile{$os} = $logfile_name;
  }

  # Create log files for operating systems that don't have one already
  foreach my $os (sort(keys(%aliases))) {
    next if defined $logfile{$os};

    # Derive a file name from the OS name: runs of non-alphanumeric
    # characters become "_", and everything is lower-cased.
    my $logfile_name = $os;
    $logfile_name =~ s{[^a-zA-Z0-9]+}{_}g;
    $logfile_name =~ tr/A-Z/a-z/;

    # The name may already be taken by a file that belongs to another
    # OS; if so, append the first free number.
    if (-e "$logdirectory/$logfile_name.gnuplot") {
      my $counter = 1;
      $counter++ while -e "$logdirectory/$logfile_name$counter.gnuplot";
      $logfile_name .= $counter;
    }

    $logfile_name = "$logdirectory/$logfile_name.gnuplot";
    $logfile{$os} = $logfile_name;

    # Write the "# OS-name" header line that log_file_os() looks for.
    open (LOGFILE, "> $logfile_name")
      or die "Error: Couldn't create file $logfile_name ($!)";
    print LOGFILE "# $os\n";
    close LOGFILE
      or die "Error: Couldn't create file $logfile_name ($!)";

    warn "Warning: New log file created for $os ($logfile_name)\n";
  }

  # Calculate popularity percentages for each operating system.
  # This is done using the "true Bayesian estimate" as described
  # at the bottom of the Internet Movie Database's top 250 list
  # ("http://us.imdb.com/top_250_films").  The idea is that to get
  # very high or very low ratings, you have to have a lot of votes.
  # Operating systems with few votes will be pushed towards the middle
  # of the pack.
  #
  # Here's IMDb's description of the formula used:
  #  weighted rank (WR) = (v / (v+m)) x R + (m / (v+m)) x C
  # where:
  #  R = average for the movie (mean) = (Rating)
  #  v = number of votes for the movie = (votes)
  #  m = minimum votes required to be listed
  #  C = the mean vote across the whole report
  #
  # Here R is rules / (sucks + rules) for one OS, v is sucks + rules
  # for that OS, m is $VOTES_THRESHOLD and C is the same ratio computed
  # over the totals of all operating systems.

  # Calculate the average popularity of all operating systems
  my $total_sucks = 0;
  my $total_rules = 0;

  foreach my $os (keys(%aliases)) {
    $total_sucks += $count{$os}{'sucks'};
    $total_rules += $count{$os}{'rules'};
  }

  die "Error: Weird sucks ($total_sucks) and rules ($total_rules) totals"
    if ($total_sucks <= 0 || $total_rules <= 0);

  my $average_popularity = $total_rules / ($total_sucks + $total_rules);

  # Create a datestring understandable by gnuplot.  gmtime() counts
  # months from 0 and years from 1900.  Month and day are written
  # without zero padding (for example "2001-2-20").
  my ($mday, $mon, $year) = (gmtime(time))[3, 4, 5];
  my $datestring = ($year + 1900) . "-" . ($mon + 1) . "-" . $mday;

  # Log the current date + the popularity percentages to the log files
  foreach my $os (sort(keys(%aliases))) {
    # Verify that the date is not already present in the log file
    if (log_has_entry_for($logfile{$os}, $datestring)) {
      warn "Warning: $os log file $logfile{$os} already has an entry for today.  Not adding one more.\n";
      next;
    }

    # Calculate the popularity for this operating system
    my $sucks = $count{$os}{'sucks'};
    my $rules = $count{$os}{'rules'};
    my $votes = $sucks + $rules;

    # Without votes there is no rating to compute (and the division
    # below would be by zero).
    if ($votes <= 0) {
      warn "Warning: Data not available or illegal for $os.  Sucks=$sucks, rules=$rules, votes=$votes";
      next;
    }

    my $rating = $rules / $votes;

    my $popularity =
      $rating * ($votes / ($votes + $VOTES_THRESHOLD)) +
        $average_popularity * ($VOTES_THRESHOLD / ($votes + $VOTES_THRESHOLD));

    $popularity *= 100;

    die "Error: $os popularity ($popularity) out of bounds"
      if ($popularity < 0 || $popularity > 100.0);

    # Open the operating system's log file for appending
    open (LOGFILE, ">> $logfile{$os}")
      or die "Error: Can't open $logfile{$os} for appending ($!)";

    print LOGFILE "$datestring $popularity\n";

    close LOGFILE
      or warn "Warning: Couldn't close log file $logfile{$os} ($!)";
  }
}

exit 0;
