# Copyright (c) 1996, 1997  The TERENA Association
# Copyright (c) 1998, 1999  RIPE NCC
#
# All Rights Reserved
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for any purpose and without fee is hereby granted,
# provided that the above copyright notice appear in all copies and that
# both that copyright notice and this permission notice appear in
# supporting documentation, and that the name of the author not be
# used in advertising or publicity pertaining to distribution of the
# software without specific, written prior permission.
#
# THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
# ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS; IN NO EVENT SHALL
# AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#------------------------------------------------------------------------------
#
# Filename          :   Extract.pm
# Purpose           :   Provide routines to analyse a ripe-141 document into an array
#                       of (pointers to) hashes
# Author            :   Lee Wilmot
# Date              :   971112
# Language Version  :   Perl 5, version 5.003_07
# OSs Tested        :   BSD
# Command Line      :   Nothing executable from the command line
# Input Files       :   STDIN
# Output Files      :   $RobotConfig::FROMCUST_MAIL_FILE, $RobotConfig::FROMCUST_LOG_DIR
# External Programs :   NONE
# Problems          :   1) Info BEFORE $TEMPLATE_START_REG on a line will be lost.
#                       Thus template headers must be on a line by themselves.
#                       2) If you have 2 fields on the same line, only the first 
#                       will be recognized. The second will be taken as part of 
#                       the value for the first.
# To Do             :
# Comments          :   
# Description       :
#
# 1: Write STDIN to $FROMCUST_MAIL_FILE
# 2: Write $FROMCUST_MAIL_FILE to end of $FROMCUST_LOGFILE.
# 3: Pass over $FROMCUST_MAIL_FILE, reading each template contents into a hash,
# using the appropriate extraction routine, determined by the name of the
# template encountered. Three kinds of template are distinguished, each 
# having a different extraction routine:
#
# 1) A text template. Consists of only lines of text
# 2) A plan template. Consists of lines, each holding a subnet and relevant data,
# plus possibly a totals line at the end.
# 3) A field template. Consists of lines with rough form 'field: value'
#
# A reference to each hash extracted is added to the end of @TEMPLATES.
#
# The first template extracted is a 'meta template'. Throughout the 
# extraction/analysis/report this template is used to store meta info (info
# which is not related to the content of the request, but how it is to be
# processed, such as who sent the mail, problems which are not template-specific etc)
#
# MGTM: extract() loses first $_ which is OK for emails as it is only
# From blah, but means we need a 'From ' line for web submitted text.
# We add this to web submitted texts in the file copy :-(
#

############## FIELD NAMES ADDED TO TEMPLATES ##################
# 
# For the field names which these symbols translate to, see %RobotConfig::F
#
# $F{TNAME}
#
# The string which identifies the templates. e.g. "OVERVIEW OF ORGANIZATION TEMPLATE"
# If this field name is also used on the form, it's value is ignored.
#
# $F{REMAINDER}
#
# When analysing a template, any lines found which do not match an expected
# format are placed in a field with key REMAINDER. Analysis of these could
# prove revealing at a later stage.
# 
# $F{PROBLEMS}
#
# This field may be added by the &Check::prob function, only in the event that
# some lines in a plan template had to be combined. 
# This is not good in terms of logical flow and modularity. On the other hand,
# it's very useful to be able to inform the user that this has occured: see
# 'COMBINED_SOME_LINES_ON_PLAN' in the $PROBLEMTEXT_FILE.
#
# $F{FIELD_ORDER}
#
# Used to ensure that fields in FIELD type templates are shown to the
# user in the report in the the same order as they appear on the form.
# Unfortunately, Perl returns fields using 'keys' in an 'apparently
# random order', to quote the book.

############################ PACKAGE INTERFACE ###############################
#
# Please see the package concerned for descriptions of imported symbols.

package Extract;

use strict;

BEGIN {
    use vars qw( @ISA @EXPORT_OK @TEMPLATES);

    use Exporter ();
    
    @ISA = qw( Exporter );

    @EXPORT_OK = qw(
        &extract @TEMPLATES
    );
}

# MGTM added, to decide if should show a given problem: REPORT_TYPE_FLAG,
use RobotConfig qw(
        :REGULAR_EXPRESSIONS :DEBUG_FLAGS

        %F $FORM_FS $PLAN_FS

        $FROMCUST_MAIL_FILE $FROMCUST_LOG_DIR

        %REPORT_TYPE $REPORT_TYPE_FLAG

        $TEMPLATE_NAME_EXTENSION
        $NOT_USE_AS_DESCR_REG
);

use Misc qw(
        &make_filename &add_to_log &fatal 
        &dprint &preprocess_line &strip_space
        &add_to_field &is_internal_field_name
);

use TemplateAux qw(
        &get_template_key &template_name &template_type
        &is_template_name &is_db_template &correct_template_name_misspellings
        &rough_show_template
        &looks_like_possible_corrupt_template_header 
);

use Check qw(
        &prob
);

############################ PACKAGE BODY ##############################

# Purpose       :       Main routine for template extraction
# In            :       VOID
# Out           :       VOID
#
sub extract {

    &dprint("****** EXTRACT *********");

    my $got_meta_flag = 0;                      # only extract meta template once

    # obtain unique filename    

    my $fromcust_file = &make_filename($FROMCUST_MAIL_FILE);

    # write STDIN to a temporary file

    open ( FROMCUST, ">$fromcust_file" ) or 
        &fatal("$! while opening $fromcust_file for writing STDIN to.");

    # MGTM if web submission, add artificial 'From ' line
    print FROMCUST "From Webpage\nTo: autohm.pl\n"
        if ( $REPORT_TYPE_FLAG == $REPORT_TYPE{HTML} );

    print FROMCUST
        while (<>);

    close FROMCUST ||
        &fatal("$! while closing pipe from $fromcust_file.");

    # log the message

    &add_to_log($FROMCUST_LOG_DIR, $fromcust_file);

    ### reopen temporary file for processing ###

    open ( TEMP, "$fromcust_file" ) or              
        &fatal("$! while opening $fromcust_file for reading.");
    
    # extract data structures from the tempfile
    # we miss the very first line, but this is
    # not a mail header line so it's OK

    my $seen_last_template_flag = 0;

  MAINLOOP:                             
    while ( <TEMP> ) {

        # There's a possible use of uninitialised value if the input runs
        # out just before one of the redo's below. This avoids it. 

        last if ( ! defined $_ );

        # First template to extract is meta template. We extract 
        # the fields of the mail header into it. Only once.

        if ( ! $got_meta_flag ) {

            &extract_field_template( &template_name('T_META') );
            $got_meta_flag = 1;

            $TEMPLATES[0] { $F{EMAILBODY} } = $TEMPLATES[0] { $F{REMAINDER} } || "";

            # look for the next template
            redo MAINLOOP; 
        }

        $TEMPLATES[0] { $F{EMAILBODY} } .= $_;      # Add line to 'body' field in meta template

        $seen_last_template_flag = 1
            if /$LAST_TEMPLATE_REG/i;               # don't process anymore templates
                                                    # once we've seen $LAST_TEMPLATE_REG

        next if ( $seen_last_template_flag );

        # Look for start of a template. If we find one, call the appropriate
        # subroutine.

        if ( /$TEMPLATE_START_REG(.*)$TEMPLATE_END_REG/i ) {

            my $template_name = uc &strip_space($1);

            # Template names can be very distorted. The checks below can
            # cope with a worst case '#[   ADRESSING    PLAN           ]#'
            # We need to do the checks HERE, rather than in Check.pm
            # because if we can't find the template name we can't
            # work out what type of template it is, and hence how to
            # extract it.

            # Replace too much white space between the components of the
            # name with a single space

            $template_name =~ s/\s+/ /g;

            # Correct some possible misspellings of template names

            $template_name = &correct_template_name_misspellings($template_name);

            # If they didn't put the " TEMPLATE" bit on the end, add it

            $template_name .= $TEMPLATE_NAME_EXTENSION
                if ( &is_template_name( $template_name.$TEMPLATE_NAME_EXTENSION ) );

            my $template_key =  &get_template_key($template_name);

            # extract a text template, also if we don't recognise the
            # template (i.e. we can't find a key)

            if (! $template_key || &template_type($template_key) eq 'TEXT') {
                &extract_text_template($template_name);
                redo MAINLOOP;
            }
            
            # extract a plan template

            elsif ( &template_type($template_key) eq 'PLAN') {
                &extract_plan_template($template_name);
                redo MAINLOOP;
            }
            
            # extract field template

            elsif ( &template_type($template_key) eq 'FIELD' ) {
                &extract_field_template($template_name);
                redo MAINLOOP;
            }
            
            # erronous template type in config file!

            else {
                &fatal("Erronous template type in config file: key=$template_key, type=".&template_type($template_key));
            }
        }

        elsif ( &looks_like_possible_corrupt_template_header($_) ) {

            &prob( $TEMPLATES[0], 'LOOKS_LIKE_BAD_TEMPLATE_HEADER', $_ );
            
        }
    }

    #close the temporary file

    close TEMP ||
        fatal("$! while closing pipe from $fromcust_file.");

    # Show the templates we extracted if flag set

    if ( $DETAIL_EXTRACT_FLAG ) {

        my $template_ref;

        foreach $template_ref (@TEMPLATES) {
            &rough_show_template ($template_ref);
        }
    }
}

# Purpose       :       Extract a template which consists purely of lines of text.
#                       At 971113 this includes only the overview of organisation template.
# In            :       $: name of template to extract
# Out           :       VOID
#
sub extract_text_template {

    my $template_ref = {};                              # Make a new hash

    $template_ref->{$F{TNAME}} = shift @_;              # Set it's name

    &dprint("EXTRACTING TEXT template:\t*".$template_ref->{$F{TNAME}}."*");

    $template_ref->{$F{TEXT}} = "";                     # Initialize the field text will go into

    push @TEMPLATES, $template_ref;                     # Add this hash to array with rest

    while (<TEMP>) {

        $TEMPLATES[0] { $F{EMAILBODY} } .= $_;      # Add line to 'body' field in meta template

        &preprocess_line;                       # Remove 'forwarded line' characters at 
                                                # the start of the line, like '>'.

        return if ( /$TEMPLATE_START_REG/i || /$TEMPLATE_END_REG/i );   # Stop when we see start of another template

        $template_ref->{$F{TEXT}} .= $_;        # Keep template in a single string
    }
}

# Purpose       :       Extract a template which consists of lines of subnets and
#                       relevant info, plus possibly a totals line at the end.
#                       Each line of the plan extracted to a hash with the key being
#                       the number of the line. This makes it easy to reproduce in
#                       order later on.
# In            :       $: name of template to extract
# Out           :       VOID
#
sub extract_plan_template {

    my $template_ref = {};                      # Make a new hash

    $template_ref->{$F{TNAME}} = shift @_;      # Set it's name

    &dprint("EXTRACTING PLAN template:\t*".$template_ref->{$F{TNAME}}."*");

    push @TEMPLATES, $template_ref;             # Add this hash to array with rest

    my $line_number = 1;                        # Track which line of the plan we're on
    
    my $last_unmatched_line = "";               # Store a line which didn't match
                                                # to try other stuff with it later

    my $added_a_combined_line_flag = 0;         # Track whether we've had to stitch the
                                                # plan a bit

    my $combined_line;                          # Variable for stitching together
                                                # previous subnet and current one

    my $looking_for_description_flag = 0;       # Signal whether the last subnet
                                                # matched had a description text

    while ( <TEMP> ) {

        $TEMPLATES[0] { $F{EMAILBODY} } .= $_;      # Add line to 'body' field in meta template

        &preprocess_line;                       # Remove 'forwarded line' characters at 
                                                # the start of the line, like '>'.

        # Skip blank lines: were not interested and skipping them avoids
        # potential problems later when extracting subnets which
        # are on different lines

        # Lee 19990413
        # Reset $looking_for_description_flag if we get a blank
        # line. We only want to check the next line for a description.

        if ( /^\s*$/ ) {
            $looking_for_description_flag = 0;
            next;
        } 
        
        # Stop when we see start of another template

        return if ( /$TEMPLATE_START_REG/i || /$TEMPLATE_END_REG/i );

        # Stitch together the last line and this line. We're about to test
        # to see if the two together form a valid subnet line. This occurs
        # when the subnet gets split e.g. during forwarding

        $combined_line = $last_unmatched_line.$_;

        # If we match a proper subnet line...
        #    
        # ( the & matches an ampersand which appeared in front of each figure
        # on old versions of the request form )

        # Lee 19990413
        # Last space from \s+ to \s*. Why not let them accidentally
        # miss out the space before the description ?

        if ( /  ^\s*
            &*([\d\.]{3,})                              \s+     # Prefix
            &*([\d\.]{3,})                              \s+     # Subnet Mask
            &*(\d+)                                     \s+     # Size
            &*(\d+)                                     \s+     # Current Immediate Usage
            &*(\d+)                                     \s+     # Year 1 Usage
            &*(\d+)                                     \s*     # Year 2 Usage
            &*(.*)                                              # Description
            $   
            /x ) {

            my ( $prefix, $mask, $size, $imm, $yr1, $yr2, $descr ) =
                ( $1, $2, $3, $4, $5, $6, $7 );

            # Lee 19980211 added the check for an empty subnet
            # Potential problem with this: if they put this on the
            # addressing plan, they'll get 'no subnets in plan' error.
            # This isn't really *so* unreasonable a report.

            # Lee 19990413
            # Ignore completely 0.0.0.0 0.0.0.0 0 0 0
            # (moved closing bracket down 3 statements)

            if ( ! ( 
                    $prefix =~ /^[0\.]+$/ && 
                    $mask =~ /^[0\.]+$/ &&
                    $imm == 0 && $yr1 == 0 && $yr2 == 0 && $size == 0
                   ) 
            ) {

                # ...add it to the hash under current line number

                $template_ref->{$line_number++} = 
                    join $PLAN_FS, $prefix, $mask, $size, $imm, $yr1, $yr2, $descr;

                # We don't want to stitch this subnet with something else, since
                # we've already used it. And we certainly don't want to stitch
                # something AFTER this subnet with something BEFORE this subnet
                
                $last_unmatched_line = "";
                
                # Check if we found a description line. If we didn't, signal that
                # we'd like to use the next line as a description if possible
                
                $looking_for_description_flag = 1
                    if ( $descr =~ /^\s*$/);
            }
        }

        # elsif we can match a subnet by combining this line and the last one...
        # ( the & is to match an ampersand which appeared in front of each figure
        # on old versions of the request form )

        # Lee 19990413
        # Last space from \s+ to \s*. Why not let them accidentally
        # miss out the space before the description ?

        elsif ( $combined_line =~ / ^\s*
            &*([\d\.]{3,})                              \s+     # Prefix
            &*([\d\.]{3,})                              \s+     # Subnet Mask
            &*(\d+)                                     \s+     # Size
            &*(\d+)                                     \s+     # Current Immediate Usage
            &*(\d+)                                     \s+     # Year 1 Usage
            &*(\d+)                                     \s*     # Year 2 Usage
            &*(.*)                                              # Description
            $   
            /x ) {

            my ( $prefix, $mask, $size, $imm, $yr1, $yr2, $descr ) =
                ( $1, $2, $3, $4, $5, $6, $7 );

            # Lee 19980211 added the check for an empty subnet
            # Potential problem with this: if they put this on the
            # addressing plan, they'll get 'no subnets in plan' error.
            # This isn't really *so* unreasonable a report.

            # Lee 19990413
            # Ignore completely 0.0.0.0 0.0.0.0 0 0 0
            # (moved closing bracket down 3 statements)

            if ( ! ( 
                    $prefix =~ /^[0\.]+$/ && 
                    $mask =~ /^[0\.]+$/ &&
                    $imm == 0 && $yr1 == 0 && $yr2 == 0 && $size == 0
                   )
            ) {

                # ...add it to the hash under current line number

                $template_ref->{$line_number++} = 
                    join $PLAN_FS, $prefix, $mask, $size, $imm, $yr1, $yr2, $descr;

                # Inform the requester that we've done so, but only once per plan

                if ( ! $added_a_combined_line_flag ) {
                    &prob($template_ref, 'COMBINED_SOME_LINES_ON_PLAN');
                    $added_a_combined_line_flag = 1;
                }   
                    
                # Stop it matching again next time by resetting the last line.

                $last_unmatched_line = "";
                
                # Check if we found a description line. If we didn't, signal that
                # we'd like to use the next line as a description if possible
                
                $looking_for_description_flag = 1
                    if ( $descr =~ /^\s*$/);
            }
        }

        # elsif we've matched a totals line
        # ( the & is to match an ampersand which appeared in front of each figure
        # on old versions of the request form)

        # Changed 980211 lee added [toalsTOALS] to the match to account
        # for strings like 'totals' before the totals

        elsif ( /^[\s&toalsTOALS]*
               (\d+)                                    \s+     # Total subnet size
               &*(\d+)                                  \s+     # Immediate usage
               &*(\d+)                                  \s+     # Yr2 usage
               &*(\d+)                                          # Yr3 usage
               /x ) {

            &add_to_field(\$template_ref->{$F{APTOTALS}},"$1$PLAN_FS$2$PLAN_FS$3$PLAN_FS$4");

            # We can't use the totals line as the description, and whatever comes
            # after the totals line certainly isn't a description for the previous
            # subnet

            $looking_for_description_flag = 0;
        }

        # elsif we didn't find a description last time we matched a subnet line...
        # Since we got this far, the current line doesn't hold a subnet or any
        # totals, so we use it as the description of the PREVIOUS subnet

        elsif ( $looking_for_description_flag ) {

            # only look for description on next following line

            $looking_for_description_flag = 0;

            # If it's a valid description text, add it to the LAST
            # line of the plan
            # $looking_for_description_flag is only set when there's a match,
            # so we're guaranteed that there IS such a previous line

            if ( $_ !~ /$NOT_USE_AS_DESCR_REG/) {

                $template_ref->{$line_number - 1} .= $_; 

                # So that we don't try to use it as part of a combined
                # line next time round

                $last_unmatched_line = "";
            }
        }

        # otherwise, store the LAST none-matching line anyway : can look in here for
        # near-matches later. This is important: if we miss a subnet this can
        # cause havoc with later analysis. We don't store the CURRENT unmatching
        # line, because if we manage to combine it with the next line to form
        # a line, we don't want it also sitting in REMAINDER

        else {
            
            # The conditional are purely necessary to avoid 'use of uninitialized
            # value' error from compiler :(

            if ( ! defined $template_ref->{$F{REMAINDER}}) {
                $template_ref->{$F{REMAINDER}} = $last_unmatched_line;
            }
            else {
                $template_ref->{$F{REMAINDER}} .= $last_unmatched_line;
            }

            # So that we don't try to use it as part of a combined
            # line next time round

            $last_unmatched_line = $_;
        }
    }
}

# Purpose       :       Extract a template which consists of lines of the rough form 
#                       'field: value' (precisely defined in $FIELD_REG).
# In            :       $: name of template to extract
# Out           :       VOID
#
sub extract_field_template {

    my $template_name = shift @_;

    &dprint("EXTRACTING FIELD template:\t*$template_name*");

    my $template_ref = {};                              # Make a new hash

    $template_ref->{$F{TNAME}} = $template_name;        # Set it's name

    # We need a variable to flag whether we found any non-empty fields.
    # This is so that we don't save the template if there are no usable fields.

    my $non_empty_fields_flag = 0;

    my $template_key = &get_template_key($template_name);

    my ( $field_name, $value, $known_field_name );
    my $found_end_of_header_flag = 0;

    while ( <TEMP> ) {

        if ( $template_key eq 'T_META' ) {

            # If it's the meta template and we've found the end
            # of the mail header, note the fact. We need to do this
            # because otherwise we might extract Cc:'s etc from forwarded
            # messages.

            $found_end_of_header_flag = 1
                if /^$/;
        } 
        else {

            # Add line to 'body' field in meta template
            
            $TEMPLATES[0] { $F{EMAILBODY} } .= $_;
    
            # Remove any forwarding symboles (e.g. '>') at the start
            # of a line. Don't do this if we're extracting the meta
            # template, since the mail header shouldn't have these
            # symbols before each entry.

            &preprocess_line;
        }

        # Two exit conditions:
        # 1) We find the start of another template
        # 2) if we're extracting the meta template, and we find an enclosed
        # forwarded message, we don't care about it so we stop

        if ( /$TEMPLATE_START_REG/i || /$TEMPLATE_END_REG/i ) {

            # If we found some fields with data in, add the template to the rest.

            push @TEMPLATES, $template_ref
                if ( $non_empty_fields_flag );
            
            return;
        }                       

        # If we see a field-like string and we've not overshot the header on
        # the meta template...

        if ( ! $found_end_of_header_flag && /$FIELD_REG/i ) {                  

            $field_name = $1; $value = $2;              # Record the field name and value

		 	&dprint("Found: *$field_name* *$value*")
				if ($DETAIL_EXTRACT_FLAG);

            # If the field matches a known field name case-insensitively
            # then we use the known name. This avoid a problem when they
            # change the case of letters in a field name on the form.
            # This also avoid the problem that email header field
            # names are case insensitive.

            foreach $known_field_name (keys %F) {

                if ( $field_name =~ /^$F{$known_field_name}$/i ) {
                    $field_name = $F{$known_field_name};
                    last;
                }
            }

            # If they include a field with a name the same as
            # one of the fields we use for internal tracking,
            # we ignore it, to avoid potential problems later on.

            next    
                if ( &is_internal_field_name($field_name) );

            # If field already defined, it means that the field_name
            # has been repeated on the form. The program handles these
            # by keeping multiple lines all in one hash value, but
            # separated by a field separator, $FORM_FS. This bit of codes
            # adds the $FORM_FS if necessary.

            if ( defined $template_ref->{$field_name} ) {

                # meta template -> ignore this field, shouldn't have
                # double entries of mailer lines anyway.
                next
                        if ( $template_key eq 'T_META' );

                ### otherwise -> add field separator ###
                $template_ref->{$field_name} .= $FORM_FS
                        if ( $template_ref->{$field_name} !~ /^\s*$/ );

            }
            
            # else field has not already been seem, define its intial value.

            else {      
                $template_ref->{$field_name} = "";

                push @ {$template_ref->{$F{FIELD_ORDER}}}, $field_name;
            } 

            $value =~ s/\(optional\)//; # dump possible 'optional' tag from form

            # Add the value to the field if it's not just whitespace.

            if ( $value !~ /^\s*$/) {

                $value = &strip_space($value);  # dump leading/trailing whitespace

                # flag that we've found a non-empty field unless it's the
                # 'source: RIPE' fields that occurs with database templates and
                # is pre-filled-in on the form

                $non_empty_fields_flag = 1
                    unless ( &is_db_template($template_key) && $field_name eq $F{SOURCE} && $value =~ /RIPE/i ); 

                    $template_ref->{$field_name} .= $value; #add value to the existing
                                                        #entry for this field
            }
        }

        # .... otherwise store it as junk anyway. This could be useful later 
        # e.g. meta template fluff is searched for a regid

        elsif  ( $template_name eq &template_name('T_META') ) {
            
            # These 2 lines are purely necessary to avoid 'use of uninitialized
            # value error from compiler

            if ( ! defined $template_ref->{$F{REMAINDER}}) {
                $template_ref->{$F{REMAINDER}} = $_;
            }
            else {
                $template_ref->{$F{REMAINDER}} .= $_;
            }
        }
    }

    # If we get here, we ran out of data while extracting the template.
    # This doens't matter with the other two template types, because
    # they push the template on @TEMPLATES before beginning the extract.
    # We don't do that when extracting a field, because we want to see
    # if the template had any data in it first.
	# Even if there were no fields in the meta template, we still
	# make it, since rest of program relies on there being such
	# a template.

    push @TEMPLATES, $template_ref
        if ( $non_empty_fields_flag || $template_name eq &template_name('T_META'));
}

1;
