Форум программистов [Powered by Invision Power Board]

Код

#!/usr/bin/perl -w

use strict;

 my $directory = "/newsticker/meldung";
 opendir(DIR, $directory) || die "$directory: $!";
 my @files = readdir(DIR);
 closedir(DIR);

 my @output = ();
 my @input = @_;

 foreach(@files){

  open(FILE, "<$_");

  my $title = $_;
  @input = <FILE>;
 

 FIRST: foreach (@input) {
    my $line;
    my $i;
    my @words;
    $line = $_;
    chomp($line);
    if ($line =~ /^\s*$/) {
      next FIRST;
    }
    $line =~ s/^\s*//;
    $line =~ s/\s*$//;
    @words = split(/\s+/,$line);
    $line = "";
    $i = 0;
    while ($i <= $#words) {
      # remove punctuation from start of word
      if ($words[$i] =~ /^(["'\(\)\[\]\$:;,\/\%])(.+)$/ and 
          $words[$i] !~ /^'[dsm]$/i and $words[$i] !~ /^'re$/i and
          $words[$i] !~ /^'ve$/i and $words[$i] !~ /^'ll$/i) {
    splice(@words,$i,1,$1,$2);
    $i++;
    # remove sentence breaking punctuation with quote from end of word
      } elsif ($words[$i] =~ /^(.+)([?!\.])(["'])$/) {
    splice(@words,$i,1,$1,"$2$3"," ");
    # remove non-sentence-breaking punctuation from end of word
      } elsif ($words[$i] =~ /^(.+)([:;,"'\)\(\[\]\%])$/) {
    splice(@words,$i,1,$1,$2);
    # remove sentence-breaking punctuation (not period) from end of word
      } elsif ($words[$i] =~ /^(.+)([?!])$/ or
               $words[$i] =~ /^(.+[^\.])(\.\.+)$/) {
    splice(@words,$i,1,$1,$2," ");
    # separate currency symbol from value
      } elsif ($words[$i] =~ /^([A-Za-z]+\$)(.+)$/i) {
    splice(@words,$i,1,$1,$2);
    $i++;
    # separate currency symbol other symbols
      } elsif ($words[$i] =~ /^(.*)-\$(.*)$/i) {
    splice(@words,$i,1,$1,"-","\$",$2);
    $i++;
    # split words like we're did't etcetera
      } elsif ($words[$i] =~ /^(.+)('re|'ve|'ll|n't|'[dsm])$/i) {
    splice(@words,$i,1,$1,$2);
    # split words with punctuation in the middle
      } elsif ($words[$i] =~ /^(.*[a-z].*)([",\(\)])(.*[a-z].*)$/i) {
    splice(@words,$i,1,$1,$2,$3);
    # separate words linked with sequence (>=2) of periods
      } elsif ($words[$i] =~ /^(.*[^\.])(\.\.+)([^\.].*)$/) {
    splice(@words,$i,1,"$1$2",$3);
    # remove initial hyphens from word
      } elsif ($words[$i] =~ /^(-+)([^\-].*)$/ and $words[$i] ne "-DOCSTART-") {
    splice(@words,$i,1,$1,$2);
    
    # separate number and word linked with hyphen
      } elsif ($words[$i] =~ /^([0-9\/]+)-([A-Z][a-z].*)$/) {
    splice(@words,$i,1,$1,"-",$2);
    # separate number and word linked with period
      } elsif ($words[$i] =~ /^([0-9\/]+)\.([A-Z][a-z].*)$/) {
    splice(@words,$i,1,"$1.",$2);
    # separate number and word linked with period
      } elsif ($words[$i] =~ /^(.*)\.-([A-Z][a-z].*)$/) {
    splice(@words,$i,1,"$1.","-",$2);
    # separate initial from name
      } elsif ($words[$i] =~ /^([A-Z]\.)([A-Z][a-z].*)$/) {
    splice(@words,$i,1,$1,$2);
    # introduce sentence break after number followed by period
      } elsif ($i != 0 and $words[$i] =~ /^(.*[0-9])(\.)$/) {
    splice(@words,$i,1,$1,$2," ");
    # split words containing a slash if they are not a URI
      } elsif ($words[$i] !~ /^(ht|f)tps*/i and 
               $words[$i] =~ /[^0-9\/\-]/ and
               $words[$i] =~ /^(.+)\/(.+)$/) {
    splice(@words,$i,1,$1,"/",$2);
    # put sentence break after period if it is not an abbreviation
      } elsif ($words[$i] =~ /^(.+)(\.)$/ and $words[$i] !~ /^\.+$/ and
               $words[$i] !~ /^[0-9]+\./) {
     my $word = $1;
    if ($i != $#words and &abbrev($word)) {
      $i++;
    } else { 
      splice(@words,$i,1,$1,$2," ");
    }
      } else {
    $i++;
      }
    }
    $line = join(" ",@words);
    $line =~ s/\s+/ /g;
#    $line =~ s/ ([?!\.]) (["']) / $1 $2 /g;
#    $line =~ s/ *\n *//g;
#    $line =~ tr/A-Z/a-z/;
    push @output, $line;
  }
  my($retval) = join(" ", @output);
  chomp($retval);
 
}