#!/usr/bin/perl -w

#
# copyright notice: copyright 2005,2007,2008 David A Thompson
#

#
# statement of copying permission:
#
# this program is distributed under the terms of the GNU 
# General Public License (or the Lesser GPL) and can be 
# distributed under the GNU GPL version 2 or later
#

#
# otlsub [OPTIONS] filename(s)
#	- takes straight text in filename
#	- and makes substitutions as specified in the substitution table
#	- substitution table is __DATA__ section at end of program (example table is for prettifying documents with chemistry stuff in them)
#		- items are TAB-separated
#			- so space can be included in search string
#

# OPTIONS
#   --debug	- show debugging information
#	--descend - move recursively through all subdirectories
#	--help - show help message
#	--nolinkadjust - don't try and adjust relative links to files
#	--rb	- backup (create a copy of the original file) and then replace the current file


#	examples:
#		H+ should be H<sup>+</sup>
#
#	items excluded:
#		pi - could be inorganic phosphate or number pi
#
#	notes:
#		using regex search so h+ isn't good ( h\+ is)



# 
# define internal variables
#
# - everything must be internal - otherwise problem if 
#   called from another perl routine
my $assumeyes=0; # query user
my $avlenth;

my $count;
my $currentDirectory;

my $debug;
my $descend;

my $directorylevel;

my $file;
my @filenames;
my @flag;
my $flagl;
my $fnl;

my @help;

my $i;

my $j;

my $k;

my $linkadjust;

my @newfilenames;

my @otlsubtable;

my $paramfile;
my $path;

my $replaceANDbackup;

my $zz;

#
# modules to import
#
use File::Find;			# for recursive directory descent
use File::Copy;			# for making quick copy of file


########################
#
# process command line
#
########################
#
# use @flag to hold all flags
# flagl = flag array length -1
# @filenames - used to hold all file names in command line
# fnl = number of elements in @filenames -1

$avlength=@ARGV-1;

# scan ARGV first for "--joebob" variables and put them in @flag
# and put other ARGV elements in @fn
$j=0;
$k=0;
for $i (0 .. $avlength)
{
	#print "argv-$i is ->$ARGV[$i]<-\n";
    if ( $ARGV[$i] =~ /--\w+?/ )
    {
	$flag[$j]=$ARGV[$i];
	$j++;
    }
    else
    {
	$filenames[$k]=$ARGV[$i];
	$k++;
    }
}
$flagl=$j-1;


#
# is --debug passed?
#
$debug=0;
for $i (0 .. $flagl)
{
    if ( $flag[$i] && $flag[$i] =~ m/--debug/)
    { 
	$debug=1;
	last;
    }
}
if ($debug) {
        print "flags are @flag\nfilenames are @filenames\n";
	print "debugging on\n";
}


#
# is --help passed?
#
for $i (0 .. $flagl)
{
    if ( $flag[$i] && $flag[$i] =~ m/--help/)
    { 
	@help = "\notlsub [--assume-yes] [--help ][--debug] [--descend] [--nolinkadjust] filename1 [filename2] ... [filename n]\n\n\nsee documentation distributed with package for additional information\n\n";
	print @help;
	exit;
    }
}

#
# is --descend passed?
#
$descend=0;
for $i (0 .. $flagl)
{
    if ( $flag[$i] && $flag[$i] =~ m/--descend/)
    { 
	$descend=1;
    }
    if (  $flag[$i] && $flag[$i] =~ m/--assume-yes/)
    { 
	$assumeyes=1;
    }
}

#
# is --rb passed?
#
$replaceANDbackup=0;
for $i (0 .. $flagl)
{
    if ( $flag[$i] && $flag[$i] =~ m/--rb/)
    { 
	$replaceANDbackup=1;
    }
}

#
# is --nolinkadjust passed?
#	- on by default
$linkadjust=1;
for $i (0 .. $flagl)
{
    if ( $flag[$i] =~ m/--nolinkadjust/)
    { 
		$linkadjust=0;
		if ($debug) { print "linkadjust is set to $linkadjust\n"; }
		last;
    }
		
}

####
#
# deal with parameter file
#	- only use ~/.otlsub for the substitution table at this point
#
####
$paramfile=0;	# we're not passing a paramfile argument for this script
unless ($paramfile)
{
	# if no paramfile argument, define paramfile to look for
	$path=$ENV{'HOME'};
	$paramfile='.otlsub';
	$paramfile="$path/$paramfile";
	if ($debug) { print "No parameter file given in command-line so using paramfile ->$paramfile<-\n"; }
}

#
# open parameter file if it exists and put it in @t2h array
#	- otherwise create default at ~/.otlsub
#
if ( open(INFO,$paramfile) )
{
	    if ($debug) { print "...using parameter file $paramfile \n"; }
	    open(INFO,$paramfile);
	    @otlsubtable=<INFO>;
	    close(INFO);
}
else
{
	if ($debug) { print "no .otlsub file found...writing default to ~/.otlsub\n"; }
	# create @otlsubtable array
	@otlsubtable=<DATA>;

#	hash %ENV contains current environment - use it to get home directory
	$path=$ENV{'HOME'};
	$file='.otlsub';
	$file="$path/$file";
	open(FOUT, ">$file");
	flock(FOUT,2);
	print FOUT @otlsubtable;
	close (FOUT);
}



###
#
# process files
#
###

$directorylevel=0;		# only change if we move down to new directory 
						# (only if --descend is flag)

# if --descend is on, we construct a new @filenames by recursing through
# the directories of interest
if ($descend)
{
	# imports function which lets us do recursive search through directory tree
	#	find ( \&subroutine, "directory name")
	#		- executes subroutine in each directory
	
	$currentDirectory=0;	# need to evaluate first directory we hit
	@newfilenames=();	# initialize new filename holder
	find(\&matching_filename, ".");
	# debugging
	#die @newfilenames;
	@filenames=@newfilenames;
}

#
# process filenames
#
if ($debug)
{
	print "\nFilenames to process: @filenames\n";
}
$fnl=@filenames;
unless ($filenames[0])
{
	print "Please supply a legitimate filename for otlsub to process\n";
	die;
}
for $zz (1 .. $fnl)
{
	$filename=$filenames[$zz-1];
	
	# set $directorylevel
	#	- one "/" character corresponds to current directory
	#	- two "/" characters corresponds to one directory down
	$count = $filename =~ s/\//\//g;	# counting only works with substitution
	$directorylevel=$count-1;
	if ($debug)
	{
		print "Processing $filename (directorylevel is $directorylevel\n";
	}
	
	
	# process file
	PROCESS($filename);
}


sub matching_filename
# names at $_ (from File::Find) correspond to names being evaluated for
# a match against @filenames
#
# - puts results in @newfilenames (variable of routine
#	calling this subroutine
#
{
	my $currentname;
	my $currentFullName;
	
	my @filenamesFinal;
	
	my @globbed;
		
	$currentname=$_;	# this is the current name File::Find has given us
	
	# every time we enter a new directory we need to see if @filenames needs
	# to be globbed
	if ($File::Find::dir ne $currentDirectory)
	{
		# we've entered a new directory
		@filenamesFinal=();		# array with final list of names to check
								# against								
		# we entered a new directory - check to see if @filenames needs globbing
		foreach (@filenames)
		{
			if ($_ =~ /[\*\$]/ )
			{
				@globbed=<$_>;
				foreach (@globbed)
				{
					push(@filenamesFinal,$_);
				}
			}
			else
			{
				push(@filenamesFinal,$_);
			}
		}
	}
	# now that we have a list of filenames to check against, let's see if
	# any files in the current directory match
	$currentFullName=$File::Find::name;
	for (@filenamesFinal)
	{
		if ( $_ eq $currentname) 
		{
			push(@newfilenames,$currentFullName);
		}
	}
}


####################################################
####################################################
#
# start of routine which is called for each file
#
####################################################
####################################################
#
# order of events:
#	- open file and put into array
#	- look for tags that match list

sub PROCESS {

# initialize all local variables
my @lines=();
my @linesout=();


#
# open file-to-process and put file content in array lines
#

# get info from @filein array now...
if ($debug) { print "at sub is ->@_<-\n"; }

$filein=shift(@_);

if ($debug) { print "filein is ->$filein<-\n"; }
# replace any suffix with .out unless --rb specified
if ( $filein =~ /(.+)\..*/ && $replaceANDbackup==0)	
{
	$fileout="$1.out";
# 	print "match is --> $1 <--\n\n";
# 	print "new filename is $filein\n";
}
elsif ($replaceANDbackup==0)
{
	$fileout="$filein.out";
}
else
{
	$fileout=$filein;
}

#
# open file-to-process and put file content in array lines
#
die "Can't open file name ->$filein<-" unless open(INFO,$filein);

if ($debug) { print "\nOpening $filein\n"; }
open(INFO,$filein);
@lines=<INFO>;
close(INFO);

#
# figure out what filein looks like
#
$lineslength=@lines;							#

#
# create a duplicate array (linesout) that will be
# the output
#
for ( $i=0; $i < $lineslength; $i++)
{
	$linesout[$i]=$lines[$i];
}

#if ($debug)
#{ print "\nGenerated linesout, linesout is: @linesout\n"; }


###################################
#
# PROCESS TEXT FILE-TO-PROCESS
#
###################################

# need to rewrite and reread file so that \n's are now processed correctly
# and each line ends up as a separate array element
# -- write it to temp file
unlink("/tmp/tmp.otl");
open(FOUT, ">/tmp/tmp.otl");
flock(FOUT,2);
print FOUT @linesout;
close (FOUT);
# now read it
@linesout=();
open(INFO,"/tmp/tmp.otl");
@linesout=<INFO>;
close(INFO);

#
# $llength is length of linesout
#
$llength=@linesout;
#if ($debug) 
#{	print "\nlinesout after rewrite is $llength lines long:\n";
#	for ($i=0;$i<$llength;$i++) 
#	{
#		print " line $i ->$linesout[$i]<-\n"; 
#	}
#}

#
# @otlsubtable array contains substitutions to make
#
$otlsubtableLength=@otlsubtable;

if ($debug)
{ print "otlsubtable array is @otlsubtable\n"; }
#
# check each line for items in array
#
for ($i=0; $i<$llength; $i++)
{
	if ($debug) { print "checking line $i ->$linesout[$i]<-\n"; }
	for ($j=0; $j<$otlsubtableLength; $j++)
	{
		# get item to search for
		if ( $otlsubtable[$j] =~ m/^([\S ]+?)\t/ )
		{
			$search=$1;
			if ($debug) { print "\tsearch is $search\n"; }
		}
		else
		{
			print "error in search column of __DATA__ format on line $j; skipping line $j";
		}
		
		# get item to substitute
		if ( $otlsubtable[$j] =~ m/^[\S ]+?\t+?\|\|(.+?)\|\|/ )
		{
			$substitute=$1;
			#if ($debug) { print "\tsubstitute is $substitute\n"; }

		}
		else
		{
			print "error in substitute column of __DATA__ format";
			die;
		}
		
		# check if item to substitute contains a hyperlink with an apparent
		# reference to a local file and adjust for depth unless
		# --nolinkadjust ($linkadjust=0) is set
		if ($linkadjust==1)
		{
			# check if there is an http: or other specifier in a <a> tag 
			unless ($substitute =~ m/<a.*?\"[a-zA-Z]*?\:/)
			{
				# check if there's an <a> tag with a likely local file reference
				if ($substitute =~ m/<a.*?\"/)
				{
					if ($debug)
					{ 
						print "\n\nmodifying for descend: substitute is ->$substitute<-";
						print "\n\tdirectorylevel is ->$directorylevel<-";
					}
					
					$x=$directorylevel;
					while ($x != 0)
					{
						# add "../" as needed
						# $directorylevel is integer corresponding to # of
						#	directories below initial directory (0)
						$substitute =~ s/(<a.*?\")([^\"]*?\")/$1\.\.\/$2/g;
						$x=$x-1
					}
						
					if ($debug)
					{ print "\n\tmodified for descend: substitute is ->$substitute<-"; }
				}
			}
		}
		# make substitution
		#if ($debug) { print "\tline out is $linesout[$i]\n"; }
		$linesout[$i] =~ s/$search/$substitute/g;
		#if ($debug) { print "\tnew line out is $linesout[$i]\n"; }

	}
}


#
# OUTPUT THE OUTPUT FILE WHEN FINISHED
#
#

# check if fileout is already present
if ($debug) { print "Fileout:", $fileout; }
$ow = "n";

if (-e "$fileout")
{
	if ($replaceANDbackup==0)
	{
	  # this check isn't necessary
	  # ** lazy fix; recode and comment out if there are other areas
	  # where queries should be made
	  $assumeyes = 1;

	  if ($assumeyes==0)
	    {# backup not specified so check to see if overwrite is okay
	      print "\notlsub:  Overwrite file $fileout [default is n]? ";
	      $ow=<STDIN>;
	      chomp ( $ow );		# remove \n at last line
	    }
	  else 
	    { $ow="y" }
	  unless ($ow =~ "y")
	    {
	      print "File not overwritten\n";
	      die;
	    }
	}
	else
	  {
	    # backup specified so make copy of file to be overwritten first
	    # - use copy function from File::Copy module
	    copy("$fileout","$fileout.otlbak");
	  }
      }
else
{
	if ($debug) { print "File overwritten\n"; }
}

open(FOUT, ">$fileout");
flock(FOUT,2);
print FOUT @linesout;
if ($debug) { print "Output file is $fileout\n"; }
close (FOUT);
}



########## END SUBROUTINE ############

# should alphabetize stuff below 

#format:
# item to search for is in first column and is ended with a tab character
# item to replace with is in second column and is preceded and succeeded by <<

__DATA__
[cC]a2\+	||Ca<sup>2+</sup>||
[cC]a\+\+	||Ca<sup>++</sup>||
[cC][oO]2	||CO<sub>2</sub>oO]2||
[cC]u2\+	||Cu<sup>2+</sup>||
\x20e\-\s		|| e<sup>-</sup>||
(Eo)\S		||E<sub>o</sub>||
[hH]\+		||H<sup>+</sup>||
[hH]2		||H<sub>2</sub>||
[hH]3		||H<sub>3</sub>||
HPO4		||HPO<sub>4</sub>||
H2PO4		||H<sub>2</sub>PO<sub>4</sub>||
[p\s]Ka\s			||K<sub>a</sub>||
\sKeq\s			||K<sub>eq</sub>||
[nN][Aa][Dd]\+	||NAD<sup>+</sup>||
Na\+		||Na<sup>+</sup>||
Na[23]		||Na<sub>2</sub>||
[mM]g2\+	||Mg<sup>2+</sup>||
[mM][Gg]\+\+	||Mg<sup>++</sup>||
[fF]e2\+	||Fe<sup>2+</sup>||
[fF]e3\+	||Fe<sup>3+</sup>||
[Oo]2		||O<sub>2</sub>||
oC\s			||<sup>o</sup>C||
[Rr]ed\+	||Red<sup>+</sup>||
\ssp3			||sp<sup>3</sup>||
\ssp2			||sp<sup>2</sup>||
\st1/2		||t<sub>1/2</sub>||
[Oo]x\-		||Ox<sup>-</sup>||
\[\]		||&#x2610;||
