Ok, I have tried to reduce the program as much as possible. Still there
is quite some code, sorry for that. Hope that someone will find it
interesting enough to take a look at. The program illustrates how the
same code generates different results every second run. I don't get it.
Hopefully someone else does!
Command: divide.perl lexicon.xml
The first time the program is run, the entry "affärsman" is
categorised.
Next run it's not. Third time it is again categorised...
Thanks a lot,
D
First a sample xml data, file name "lexicon.xml":
<lexicon>
<entry id='121' gender='utr' lemma='affärsman' pos='substantiv'>
<word orth='affärsman' tag='sin-ind-nom'>
<transcription string='xxx'/>
</word>
<word orth='affärsmans' tag='sin-ind-gen'>
<transcription string='xxx'/>
</word>
<word orth='affärsmannen' tag='sin-def-nom'>
<transcription string='xxx'/>
</word>
<word orth='affärsmannens' tag='sin-def-gen'>
<transcription string='xxx'/>
</word>
<word orth='affärsmän' tag='plu-ind-nom'>
<transcription string='xxx'/>
</word>
<word orth='affärsmäns' tag='plu-ind-gen'>
<transcription string='xxx'/>
</word>
<word orth='affärsmännen' tag='plu-def-nom'>
<transcription string='xxx'/>
</word>
<word orth='affärsmännens' tag='plu-def-gen'>
<transcription string='xxx'/>
</word>
</entry>
<entry id='6' gender='utr' lemma='affär' pos='substantiv'>
<word orth='affär' tag='sin-ind-nom'>
<transcription string='xxx'/>
</word>
<word orth='affärs' tag='sin-ind-gen'>
<transcription string='xxx'/>
</word>
<word orth='affären' tag='sin-def-nom'>
<transcription string='xxx'/>
</word>
<word orth='affärens' tag='sin-def-gen'>
<transcription string='xxx'/>
</word>
<word orth='affärer' tag='plu-ind-nom'>
<transcription string='xxx'/>
</word>
<word orth='affärers' tag='plu-ind-gen'>
<transcription string='xxx'/>
</word>
<word orth='affärerna' tag='plu-def-nom'>
<transcription string='xxx'/>
</word>
<word orth='affärernas' tag='plu-def-gen'>
<transcription string='xxx'/>
</word>
</entry>
</lexicon>
First program - the one reloading another one.
Name "divide.perl"
########################################
#!/usr/bin/perl
use Module::Reload;
#$Module::Reload:
ebug = 3;
use Nouns_minimal;
# XML::Simple for parsing xml file
use XML::Simple;
#use Data:
umper;
use strict;
no strict 'refs';
#use warnings;
#use diagnostics;
##############################################################################
# VARS
#---------------------------------
my $INPUT_FILE = $ARGV[0]; # lexicon xml file
our $XML_ENTRY = undef;
our $ENTRY_NUMBER = undef;
my $LOOP = 1; # boolean for controlling when rereading xml-file
##################################################################################
# MAIN
##################################################################################
print STDERR "Parsing lexicon XML file...\n";
# Parse XML into arrays and hashes
# We use "ForceArray" for "word" and "transcription" since these
sometimes are only one
# entity - but still we need them in an array - otherwise our code gets
jammed...
my $lexicon = undef;
# time the parsing
$lexicon = XMLin($INPUT_FILE, ForceArray => [ 'word', 'transcription'
]);
# get num entries in lexicon
my $tot_num_entries = scalar (keys %{$lexicon->{entry}});
print STDERR "Found $tot_num_entries entries in lexicon.\n";
print STDERR "Done.\n";
#print Dumper($lexicon);
#print XMLout($lexicon);
my $answ = "n"; # defualt answ
while ($LOOP) {
# INIT GLOBAL VARS
#init_vars();
print STDERR "\nNow trying to divide lexicon into sub
categories...\n";
# loop over all entries
foreach $XML_ENTRY (%{$lexicon->{entry}}) {
# ANALYSE depending on POS
if (defined $XML_ENTRY->{pos} && $XML_ENTRY->{pos} =~ /substantiv/o)
{
Nouns_minimal::check_noun(\$XML_ENTRY);
} else {
$ENTRY_NUMBER = $XML_ENTRY;
print STDERR "Looking at entry number $ENTRY_NUMBER\n";
}
} # while infile
# Ask if we shall go again...
print "\nAnother run? (y/n): ";
$answ = <STDIN>;
unless ($answ =~ /y/) {
$LOOP = 0;
}
# Reload Nouns_minimal...
Module::Reload->check;
} # while loop
__END__
Then the Nouns_minimal.pm
##########################
# NOUNS
package Nouns_minimal;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(check_noun);
use strict;
no strict 'refs';
###########################################
# gets a full entry
# Returns 1 (noun was categorised) or 0 (not categorized)
sub check_noun {
# first arg is the full entry
my $entry = ${$_[0]};
# catch some useful fullforms
my $plu_ind_nom = "";
my $plu_def_nom = "";
my $sin_ind_nom = "";
my $sin_def_nom = "";
# get some useful fullforms
foreach my $ff (@{$entry->{word}}) {
if ($ff->{tag} =~ /plu-ind-nom/) {
$plu_ind_nom = $ff;
}
elsif ($ff->{tag} =~ /sin-ind-nom/) {
$sin_ind_nom = $ff;
}
elsif ($ff->{tag} =~ /sin-def-nom/) {
$sin_def_nom = $ff;
}
elsif ($ff->{tag} =~ /plu-def-nom/) {
$plu_def_nom = $ff;
}
}
# FIND CORRECT DECLINATION
#######################################
my $classified = 0; # flag to signal if the word was classified or not
#===============
# PC7a
#===============
# Irregular nouns UTR (man - män, mus-möss)
if (
$plu_ind_nom # reuire plural form!
&& $entry->{gender} eq "utr" # noun is utrum
&& length($plu_ind_nom->{orth}) == length($entry->{lemma})# lemma
and plu same length
&& &has_vowelshift($sin_ind_nom,$plu_ind_nom) # shifts vowel
) {
print STDERR ("$entry->{lemma} is PC7a\n");
}
#######################################################
} # sub check_noun
#############################################################################
#############################################
# Check if a noun has a vowelshift
# Compares singular nom and plural nom
# Returns 1 if true, 0 if no shift
#############################################
sub has_vowelshift {
my $sin_ind_nom = shift;
my $plu_ind_nom = shift;
my $vowels = "[aoiyåäöeu]";
my @vowels = qw(a o i y å ä ö e u);
print STDERR ("CHECKING STEMCHANGE: '$sin_ind_nom->{orth}' VS
'$plu_ind_nom->{orth}'\n");
# loop over all vowels in lemma and try shifting them
while ($sin_ind_nom->{orth} =~ /($vowels)/g) {
my $v = $1;
my $left = $`;
my $right = $';
print STDERR ("Found vowel: '$`' '$v' '$''\n");
foreach my $vow (@vowels) {
unless ($v eq $vow) { # don't try shifting e.g. "ä" -> "ä"...
my $temp_nom = $sin_ind_nom->{orth};
# shift vowels:
$temp_nom =~ s/$left$v$right/$left$vow$right/;
# if shifting one vowel produces the plural form - we have a shift!
# like "man" - "män"
if ($temp_nom eq $plu_ind_nom->{orth}) {
return 1;
} # if match
} # unless
} # for each
} # while all vowels in lemma
# if we get here - no shifting found - return false
return 0;
} # sub has_vowelshift
1;
__END__