#!/usr/bin/perl

# ======================================================================
#
# makewhatis
#
# Version 1.0 - 28-JUL-1995
# (c) Steve "Mr. Bassman" Bryant
# spbryant@tardis.soc.staffs.ac.uk (Permanent but not read often)
# itbryant@hpbbi26.bbn.hp.com (Non-permanent, but that's where I am now)
#
#
# Description:
#
# 	This script creates the "whatis" files, used by apropos(1).  It
# requieres zcat(1) to be able to read compressed files; it is assumed
# that zcat can also read gnu-zipped files (if there are any).  It will
# also handle the /usr/man/preformat directory if your system has it.
#
#
# Background:
#
# 	I wrote this because the one I had didn't work properly.  This
# seems to be my usual reason for writing anything these days outside of
# work.  I could have just modified the other, but in re-writing from
# scratch in Perl (instead of sh/awk/sed/sort/whatever), I managed to
# drop the run-time down to a quarter of what it was before (nearly
# 8 mins down to less than 2 on my box).
#
#	I've tested it as thoroughly as I can, but if it still doesn't
# work, let me know and I'll fix it.
#
#
# Usage:
#	makewhatis [-h] [-v] [-u] [-w] [manpath]
#
#
# Copyright:
#
# 	Permission is hereby given to redistribute this program and
# accompanying manual page at no cost to the recipient (media fee
# notwithstanding), providing it is distributed in its entirety.  It
# may not be modified or be included in any commercial product without
# the author's permission.
#
# Warning:
#	This program is provided "as-is", with no warranty whatsoever.
# Use it entirely at your own risk.
#
# ======================================================================


#
# General variables:
#
$update = undef;
$verbose = undef;
$manpath = undef;
$official_manpath = undef;
%done_dirs = ();

umask 022;

select (STDERR); $| = 1;	# Unbuffers output on this stream
select (STDOUT); $| = 1;	# Leave STDOUT as the default !


#
# Ensure the $PATH environment variable contains /bin and /usr/bin:
#
$ENV{'PATH'} .= ":/bin:/usr/bin";


#
# Process command line options
#
while ($arg = shift (@ARGV))
{
    # See if it starts with a dash
    if ($arg =~ /^-/)
    {
	#
	# A dash argument may have more than one letter following it,
	# so we split the argument into an array of letters, remove the
	# dash from it, and process what remains.
	#
	@letters = split ('', $arg);
	shift (@letters);

	foreach $_ (@letters)
	{
	    SWITCH:
	    {
		/h/ && do
		{
		    &print_help;
		    exit;
		};
		/u/ && do
		{
		    $update = 1;
		    last SWITCH;
		};
		/v/ && do
		{
		    $verbose = 1;
		    last SWITCH;
		};
		/w/ && do
		{
		    next		# Don't bother including it twice
			if ($official_manpath);	

		    if ($official_manpath = `man -w`)
		    {
			chomp $official_manpath;	# Remove \n from end

			$manpath .= ":"
			    if ($manpath);

			$manpath .= $official_manpath;
		    }
		    else
		    {
			warn "$0: can't execute 'man -w'\n";
		    }
		    last SWITCH;
		};
		/c/ && last SWITCH;	# Ignored on purpose


		#
		# Default case here:
		#
		die "$0: unknown option: $arg\n";
	    }
	}
    }
    else
    {
	# Add the argument to the manpath (with colon if necessary).
	$manpath .= ":"
	    if ($manpath);

	$manpath .= $arg;
    }
}


#
# If the user hasn't specified a manpath or set the -w switch, then we
# use the contents of the $MANPATH environment variable.
#
$manpath = $ENV{'MANPATH'}
    if (!$manpath);



#
# Now, we cycle through each element in the manpath, and create a
# whatis file for it.
#
@elements = split (/:/, $manpath);
foreach $directory (@elements)
{
    &make_whatis ($directory);
}


#
# Now a little hack for /usr/man/preformat - comment this out if your
# system simply uses the cat directories.
#
&do_preformat_hack
    if ($manpath =~ /(^|:)\/usr\/man(\/preformat)?(:|$)/);


exit;



# ======================================================================
#
# Subroutines
#
# ======================================================================


#
# Print out a brief help message
#
sub print_help
{
    print <<EOT;
usage: makewhatis [-h] [-u] [-v] [-w] [manpath]
options:
  -h   print this help and exit.
  -u   update whatis database with newer man pages only.
  -v   verbose
  -w   use `man -w` to obtain the manpath.  This may be used in
       addition to a user-specified manpath.

    If a manpath is specified, the the contents of that path will be
processed.  If none is specified, then the contents of \$MANPATH will
be processed unless the -w switch is set.
EOT

}



#
# This routine processes the specified directory and creates (or updates)
# a file called "whatis" in the directory.
#
sub make_whatis
{
    local ($directory) = @_;

    #
    # See if we've already done this directory...
    #
    if ($done_dirs{$directory})
    {
	warn "$0: already processed directory: $directory\n"
	    if ($verbose);

	return;
    }
    else
    {
	$done_dirs{$directory} = 1;
    }


    #
    # Make sure we can cd to the directory
    #
    unless (chdir ($directory))
    {
	warn "$0: can't chdir to \"$directory\": $!\n";
	return;
    }


    print "Processing directory: $directory\n"
	if ($verbose);


    #
    # Initialise the "whatis" array - all the entries are stored here and
    # written out to file at the end.
    #
    %whatis = ();


    #
    # If this is only an update, the we load in the existing whatis file for
    # modification - usually it would just be overwritten.
    #
    if ($update)
    {
	$changed = undef;
	$whatis_mtime = 0;

	if (-e "whatis")
	{
	    if (open (WHATIS, "<whatis"))
	    {
		while ($line = <WHATIS>)
		{
		    chop $line;
		    $whatis{$line} = 1;
		}
		close (WHATIS);

		$whatis_mtime = &mtime ("whatis");	# No fail check
	    }
	    else
	    {
		warn "$0: can't open $directory/whatis: $!\n";
		return;
	    }
	}
    }


    #
    # Now we have to loop through each of the subdirectories and process their
    # contents.  Not all subdirectories will be processed - ones which match
    # the pattern man[0-9a-z] will be processed as nroff/groff source manual
    # pages, and cat[0-9a-z] directories will be processed as plain text
    # manual pages.  When I can be bothered, I'll put in a bit that deals
    # with [man|cat].[0-9a-z].Z (ie: specifically compressed directories).
    # Subdirectories that are links are ignored.
    #

    unless (opendir (CURRENT_DIR, "."))
    {
	warn "$0: can't read directory \"$directory\": $!\n";
	return;
    }


    SUBDIR:
    while ($subdir = readdir (CURRENT_DIR))
    {
	#
	# Ignore links
	#
	next
	    if (-l $subdir);


	#
	# Ignore non-man subdirs
	#
	next
	    if ($subdir !~ /^(man|cat)[0-9a-z](\.Z)?$/);


	#
	# Ignore non-directories
	#
	if (! -d $subdir)
	{
	    warn "$0: $directory/$subdir is not a dirctory\n"
		if ($verbose);
	    next;
	}


	#
	# If we're updating only, check whether the directory has a newer
	# modification time than the whatis file.
	#
	next
	    if ($update && (&mtime ($subdir) < $whatis_mtime));


	print "Processing: $subdir\n"
	    if ($verbose);


	#
	# Set up some info about the subdir we're processing...
	#
	$section = substr ($subdir, 3, 1);	# Assumes 1 char only
	$type = "m";
	$type = "c"
	    if (substr ($subdir, 0, 3) eq "cat");
	$compressed_dir = undef;
	$compressed_dir = 1
	    if ($subdir =~ /.Z$/);


	#
	# Process the files in the subdirectory...
	#
	unless (opendir (SUBDIR, $subdir))
	{
	    warn "$0: can't read directory \"$directory/$subdir\": $!\n";
	    next;
	}

	FILE:
	while ($filename = readdir (SUBDIR))
	{
	    #
	    # Ignore "." and ".."
	    #
	    next
		if (($filename eq ".") || ($filename eq ".."));


	    #
	    # If this is only an update, ignore files older that the
	    # whatis file.
	    #
	    next
		if ($update && (&mtime ("$subdir/$filename") < $whatis_mtime));


	    #
	    # Work out what section this manpage thinks it's in.
	    #
	    @name_bits = split (/\./, $filename);
	    pop (@name_bits)			# Remove 'Z' or 'gz' from end.
		if ($name_bits[$#name_bits] =~ /(Z|z|gz)/);
	    $manpage_section = pop (@name_bits);


	    if (substr ($manpage_section, 0, 1) ne $section)	# Wrong section
	    {
		warn "$0: $directory/$subdir/$filename seems to be in the " .
		    "wrong section !\n";
		next;
	    }


	    print "Adding: $filename\n"
		if ($verbose);


	    #
	    # Now we have to read the file (it may be compressed), and
	    # determine the "synopsis" of the man page (it may be pre-
	    # formatted).
	    #
	    if ($compressed_dir || ($filename =~ /.(Z|z|gz)$/))
	    {
		unless ($pid = open (MANPAGE, "zcat $subdir/$filename|"))
		{
		    warn "$0: can't open $directory/$subdir/$filename: $!\n";
		    next;
		}
	    }
	    else
	    {
		$pid = undef;
		unless (open (MANPAGE, "<$subdir/$filename"))
		{
		    warn "$0: can't open $directory/$subdir/$filename: $!\n";
		    next;
		}
	    }



	    #
	    # Get the NAME info from the manpage
	    #
	    if ($type eq "m")
	    {
		$name = &process_source_page;
	    }
	    else
	    {
		$name = &process_ascii_page;
	    }


	    #
	    # Close the file
	    #
	    kill 9, $pid	# Avoids zcat complaining about broken pipes
		if ($pid);
	    close (MANPAGE);


	    #
	    # Do some processing on the manpage info (if one was returned).
	    #
	    if ($name)
	    {
		$changed = 1
		    if ($update);


		$name =~ s/--/ - /;	# Change "--" to " - "
		$name =~ s/:/ - /	# Change ":" to " - "
		    if ($name !~ /-/);	# if they didn't use a dash
		$name =~ s/-/ - /	# Fix old cat pages
		    if ($name !~ / - /);
		$name =~ tr/\t/ /;	# Change tabs to spaces
		$name =~ s/\s+/ /g;	# Collapse whitespace
		$name =~ s/^ //;	# Delete whitespace from start of line
		$name =~ s/ $//;	# Delete whitespace from end of line
		$name =~ s/ , /, /g;	# Fix comma spacing


		#
		# We now format the line - we split it into "name" and
		# "description", and space them out nicely.
		#
		@name_bits = split (/\s+-\s+/, $name, 2);

		$name = "$name_bits[0] ($manpage_section) ";
		while (length ($name) < 21)
		{
		    $name .= " ";
		}

		$name .= "- $name_bits[1]";

		$whatis{$name} = 1;
	    }
	}

	closedir (SUBDIR);
    }


    closedir (CURRENT_DIR);


    #
    # At this point, we have an array called %whatis which contains all
    # the database entries.  These simply have to be sorted and written
    # to the whatis file.
    #
    unless (open (WHATIS, ">whatis"))
    {
	warn "$0: can't open $directory/whatis: $!\n";
	return undef;
    }

    foreach $name (sort keys %whatis)
    {
	print WHATIS "$name\n";
    }

    close (WHATIS);
}



#
# This routine returns the modification time of the given filename,
# or undefined if the stat fails.
#
sub mtime
{
    local ($filename) = @_;

    unless (local ($mtime) = (stat ($filename))[9])
    {
	warn "$0: can't stat $directory/$filename: $!\n";
	return undef;
    }

    return $mtime;
}



#
# This routine processes a "source" manual page.
# It looks for a section called "NAME", as denoted by the ".SH" keyword,
# and returns the first line of this section (this section is usually only
# one line long anyhow).
# If no such section is found, it return undefined.
#
sub process_source_page
{
    local ($line, $name);	# Local variables


    while ($line = <MANPAGE>)
    {
	chop $line;		# Remove trailing \n

	$line =~ s/^\s*//g;	# Remove whitespace from start of line
	$line =~ tr/a-z/A-Z/;	# Convert to upper case


	if ($line =~ /^\.SH\s+NAME/)	# Found .SH NAME
	{
	    $name = "";

	    while ($line = <MANPAGE>)	# Read next line
	    {
		chop $line;		# Remove trailing \n
		$line =~ s/\\-/-/;	# Change "\-" to "-".
		$line =~ s/^\s+//;	# Remove whitespace from start of line
		$line =~ s/\s+$//;	# Remove whitespace from end of line


		#
		# If the line just read is the next section or is blank,
		# then we're done and can return what we've found.
		#
		return $name
		    if (($line eq "") || ($line =~ /^\.[Ss][Hh]/));


		#
		# Is there are any more nroff commands, they must either
		# be removed or interpreted.
		#
		if ($line =~ /^\./)	# Dot commands at start of line
		{
		    $line =~ s/^\.([IB]|Nm|Tn|Li|Dq|Nd)//;	# Remove some
		    $line =~ s/^\.\\\"*//;		# Remove comments


		    #
		    # I'm not sure what these are, but they appear in the
		    # Motif man pages (but there's no mention of them in
		    # my gtroff man page).  All the same, it seems the best
		    # course of action to delete them, along with any
		    # arguments they may have.
		    #
		    $line =~ s/^\.(iX|PP|z[AZ]).*$//;
		}


		if ($line =~ /\\/)	# Inline commands
		{
		    #
		    # Remove font settings and string interpolation
		    #
		    $line =~ s/\\[f\*]\(..//g;
		    $line =~ s/\\f.//g;
		    $line =~ s/\\\*.//g;



		    #
		    # Remove font sizings
		    #
		    $line =~ s/\\s('?)[+-]?\d+$1//g;
		    $line =~ s/\\s\(\d\d//g;


		    #
		    # Some interpretaions
		    #
		    $line =~ s/\\\(em/--/g;
		    $line =~ s/\\\(hy/-/g;
		    $line =~ s/\\\(ul/_/g;


		    $line =~ tr/\\//d;	# Delete any remaining backslashes
		}


		#
		# Add a space on the end if the line's not blank, and
		# if it doesn't end with a dash.
		#
		$line .= " "
		    if ($line && ($line !~ /\S-$/));


		#
		# Add this line to the name, and delete hyphenation.
		#
		$name =~ s/\S-$//
		    if ($line);
		$name .= $line;
	    }


	    #
	    # If we reached the end of the file, return what we found;
	    # it may be undefined (which is ok).
	    #
	    return $line;
	}
    }


    #
    # Reached EOF
    #
    return undef;
}



#
# This routine processes a "cat" man-page, looking for the same as the above
# routine.  However, we often have to contend with backspace characters in
# these files, as they are used to do underlining and emboldening.
#
sub process_ascii_page
{
    local ($line, $name);	# Local variables


    while ($line = <MANPAGE>)
    {
	chop $line;		# Remove trailing \n

	$line =~ s/^\s*//g;	# Remove whitespace from start of line
	$line =~ s/.\010//g;	# Remove the "backspace effect"

	if ($line =~ /^NAME/)
	{
	    $name = "";

	    while ($line = <MANPAGE>)
	    {
		chop $line;
		$line =~ s/.\010//g;	# Remove the "backspace effect"
		$line =~ s/^\s*//;	# Remove whitespace from the start


		#
		# We keep going until we get a blank line
		#
		return $name
		    if ($line eq "");

		#
		# Add a space on the end if the line's not blank, and
		# if it doesn't end with a dash.
		#
		$line .= " "
		    if ($line && ($line !~ /\S-$/));


		#
		# Add this line to the name, and delete hyphenation.
		#
		$name =~ s/\S-$//
		    if ($line);
		$name .= $line;
	    }


	    #
	    # Return what we've found
	    #
	    return $name;
	}
    }

    return undef;
}



#
# A hack for the /usr/man/preformat directory found on many Linux systems.
#
# This directory doesn't usually have its own whatis database; the man
# won't usually look there.  What this program does is to create a whatis
# database for /usr/man/preformat, and then insert its contents into
# /usr/man/whatis.  It doesn't delete /usr/man/preformat/whatis so that
# future updates can be done more quickly.
#
sub do_preformat_hack
{
    $directory = "/usr/man";
    $subdir = "preformat";


    #
    # Firstly - is it there ?
    #
    if (! -d "$directory/$subdir")
    {
	warn "$0: can't incorporate whatis database from /usr/man/preformat: "
	    . "no such directory\n"
	    if ($verbose);
    }


    unless (chdir ($directory))
    {
	warn "$0: can't cd to $directory: $!\n"
	    if ($verbose);	# This error will also appear earlier

	return undef;
    }


    return undef
	unless (-e "$subdir/whatis");	# No whatis file


    return undef
	unless (-e "whatis");



    #
    # If we're in "update" mode, it may not be necessary to go through
    # with this routine...
    #
    return undef
	if ($update && (&mtime ("$subdir/whatis") < &mtime ("whatis")));



    #
    # Load the preformat whatis database into the %whatis array
    #
    unless (open (PREFORMAT, "<$subdir/whatis"))
    {
	warn "$0: can't open $directory/$subdir/$whatis: $!\n";
	return undef;
    }

    %whatis = ();

    while ($line = <PREFORMAT>)
    {
	chop $line;		# Remove trailing \n
	$whatis{$line} = 1;
    }

    close (PREFORMAT);


    #
    # Now we load in the main whatis database into the same array (which
    # will eliminate duplicates).
    #
    unless (open (WHATIS, "<whatis"))
    {
	warn "$0: can't open $directory/whatis: $!\n";
	return undef;
    }

    while ($line = <WHATIS>)
    {
	chop $line;		# Remove trailing \n
	$whatis{$line} = 1;
    }

    close (WHATIS);


    #
    # Now we sort the entries and write them out to the one file.
    #
    unless (open (WHATIS, ">whatis"))
    {
	warn "$0: can't open $directory/whatis: $!\n";
	return undef;
    }

    foreach $line (sort keys %whatis)
    {
	print WHATIS "$line\n";
    }

    close (WHATIS);
}

# ======================================================================
#
# End of file: makewhatis
#
# ======================================================================
