#!/usr/local/bin/perl
# $Id: drop4.pl,v 1.2 2001/06/07 09:32:56 mahl Exp mahl $

# Write one HTML::TokeParser token to the N_HTML filehandle, using the
# original source text that the parser stored in the token.
#
# Parameter:
#   $token - array reference as returned by HTML::TokeParser::get_token.
#            Element 0 names the token kind; the element that holds the
#            source text depends on that kind:
#              S  (start tag)              -> element 4
#              E  (end tag)                -> element 2
#              T  (text)                   -> element 1
#              C  (comment)                -> element 1
#              D  (declaration)            -> element 1
#              PI (processing instruction) -> element 2
#
# Tokens of any other kind, and an undefined token, print nothing.
# N_HTML must already be open for writing when this sub is called.
#
# The name collides with Perl's built-in write(), so callers have to use
# the explicit form &write($token).
sub write{
	my ($token)=@_;
	return unless $token;

	# Position of the verbatim source text for each token kind.
	my %text_index=(S=>4, E=>2, T=>1, C=>1, D=>1, PI=>2);

	my $index=$text_index{$$token[0]};
	return unless defined $index;

	print N_HTML "$$token[$index]";
}


# Record $line in desc.txt unless an identical line is already there.
#
# The file is scanned in Perl rather than through an external "grep", so
# text taken from a page never reaches a shell command line and is compared
# literally instead of being interpreted as a regular expression.
# A desc.txt that cannot be opened for reading is treated as empty; the
# append below then creates it.
sub _drop_remember_desc{
	my ($line)=@_;
	if (open(my $in,'<','desc.txt')){
		while (defined(my $seen=<$in>)){
			chomp $seen;
			if ($seen eq $line){
				close($in);
				return;
			}
		}
		close($in);
	}
	open(my $out,'>>','desc.txt') or die "impossible de creer desc.txt \n";
	print $out "$line\n";
	close($out);
}

# Clean one downloaded page and write the result to ./leg_euro/<basename>.
#
# Parameter:
#   $file - path of the HTML file; its last component is the output file
#           name and the component before it is used as the directory name
#           in desc.txt.
#
# What is removed or rewritten:
#   * <img> start tags,
#   * links whose first non-newline content is an <img> (through </a>),
#   * the page regions selected by the $bloc counter (see below),
#   * remaining <a> tags are re-emitted as <A HREF="..."> pointing at the
#     last path component of the original href.
#
# $bloc is a counter that advances each time the next expected marker token
# is seen (marker list in the first "if" of the loop). Tokens are written
# only while $bloc is even; odd values mark regions that are dropped.
#
# Side effects: opens/overwrites ./leg_euro/<basename> through the global
# N_HTML handle (also used by &write), may append one line to desc.txt,
# prints progress messages on STDOUT. Dies when the input or output file
# cannot be opened.
sub drop{
	my ($file)=@_;
	my $p = HTML::TokeParser->new("$file")
		or die "impossible d'ouvrir $file: $!\n";
	print "TRAITE: $file\n";
	my @url=split '/',$file;
	my $fich=pop @url;
	my $repertoire=pop @url;
	# "or" is required here: with "||" the die would attach to the file
	# name string and could never run.
	open(N_HTML,'>',"./leg_euro/$fich") or die "LE REP leg_euro N EST PAS CREE";
	my $token = $p->get_token;
	# Start tag of the link currently being examined. It is only assigned
	# when an <a> is met, so no token is fetched (and lost) to initialise it.
	my $save;
	my $bloc=0;	# region counter, see header comment
	my $flag=0;	# 1 once the INFODOC footer has been written
	my $com=0;	# 0: looking for "Document", 1: looking for "Journal"/"JO", 2: done
	my @info=("-") x 12;	# [0] reference, [1] JO number, [2] JO date, [3] JO page
	my $sign=0;	# progress of the description capture: <p> -> 1, <b> -> 2, </b> -> 3
	my $desc="";
	while ($token){
		# Switch between kept and dropped regions.
		if(($bloc == 0 && $$token[0] eq "T" && $$token[1] eq "Chapitres du répertoire où le document peut être trouvé:")
		||($bloc == 1 && $$token[0] eq "E" && $$token[1] eq "b")
		||($bloc == 2 && $$token[0] eq "S" && $$token[1] eq "br")
		||($bloc == 3 && $$token[0] eq "S" && $$token[1] eq "hr")
		||($bloc == 4 && $$token[0] eq "C" && $$token[1] eq "<!-- Page footer - main column -->")
		||($bloc == 5 && $$token[0] eq "E" && $$token[1] eq "tr")){$bloc++;}

		# Description capture: inside region 4, the text of the first <b>
		# that follows a <p> is collected and recorded when </b> is reached.
		if ($bloc == 4 && $sign==2 && $$token[0] eq "E" && $$token[1] eq "b"){
			$desc=$repertoire."\t".$desc."\t".$fich;
			# Only the first double quote is replaced; kept unchanged so
			# that the lines written to desc.txt keep their format.
			$desc=~s/\"/ /;
			print "desc-----$desc\n";
			_drop_remember_desc($desc);
			$sign++;
		}
		if ($bloc == 4 && $sign==2 && $$token[0] eq "T"){$desc.=$$token[1]}
		if ($bloc == 4 && $sign==1 && $$token[0] eq "S" && $$token[1] eq "b"){$sign++;}
		if ($bloc == 4 && $sign==0 && $$token[0] eq "S" && $$token[1] eq "p"){$sign++;}

		# On entering region 5, emit once the collected metadata as HTML
		# comments plus a link back to the page on europa.eu.int.
		if($bloc==5 && $flag==0 ){
			$flag=1;
			print N_HTML "\n<!--INFODOC-->\n";
			print N_HTML "<!--REF--><!--$info[0]-->\n";
			print N_HTML "<!--NUMJO--><!--$info[1]-->\n";
			print N_HTML "<!--DATEJO--><!--$info[2]-->\n";
			print N_HTML "<!--PAGEJO--><!--$info[3]-->\n";
			print N_HTML qq(<A HREF="http://$file">consulter cette page sur europa.eu.int</A>\n);
		}

		# Links: drop those that wrap an image, rewrite the others.
		if($$token[0] eq "S" && $$token[1] eq "a"){
			$save=$token;
			$token = $p->get_token;
			# Newline-only text between <a> and its content is skipped.
			# Every test checks $token first so that the end of the page
			# is never dereferenced.
			while($token && $$token[0] eq "T" && $$token[1] eq "\n"){$token = $p->get_token;}
			if ($token && $$token[0] eq "S" && $$token[1] eq "img"){
				# Skip through the closing </a>; stop at end of input so
				# that a page without </a> cannot loop forever.
				while ($token && !($$token[0] eq "E" && $$token[1] eq "a")){$token = $p->get_token;}
				$token = $p->get_token if $token;
			}
			# Make the link relative: keep only the last path component.
			else{
				if($bloc % 2 == 0){
					my @lien=split '/' ,$$save[2]->{href};
					my $page=pop @lien;
					print N_HTML qq(<A HREF="$page">);
				}
			}
			# $token already holds the next token to examine.
		}

		# Images are dropped.
		elsif($$token[0] eq "S" && $$token[1] eq "img"){$token = $p->get_token;}

		# Everything else: harvest metadata and copy the token if kept.
		else{
			if($bloc == 0 && $$token[0] eq "T" && $com==0){
				# NOTE(review): text tokens reaching this branch are not
				# written to the output, whether or not they start with
				# "Document" -- confirm this is intended.
				my @tmp=split ' ',$$token[1];
				if($tmp[0] eq "Document"){
					$com++;
					$info[0]=$tmp[1];
				}
			}
			elsif($bloc == 4 && $$token[0] eq "T" && $com==1){
				my @tmp=split ' ',$$token[1];
				my $journal=($tmp[0] eq "Journal");
				if($journal || $tmp[0] eq "JO"){
					$com++;
					&write($token);
					$token=$p->get_token;
					my $kind=$token ? $$token[0] : "";
					print "2eme token-->  $kind  ";
					# Gather the words of the whole run of text tokens,
					# copying each of them to the output.
					while($token && $$token[0] eq "T"){
						push @tmp,split(' ',$$token[1]);
						&write($token);
						$token=$p->get_token;
					}
					print "INFO :";
					foreach(@tmp){print "  $_  ";}
					print "\n";
					# The word positions depend on whether word 3 is
					# purely numeric. NOTE(review): presumably this
					# distinguishes an OJ series letter from a bare
					# number -- confirm against sample pages.
					if($journal){
						if ($tmp[3] !~/^\d*$/){
							$info[1]=join ' ',$tmp[3],$tmp[4];
							$info[2]=$tmp[6];
							$info[3]=join ' ',$tmp[8],$tmp[9],$tmp[10];
						}
						else{
							$info[1]=$tmp[3];
							$info[2]=$tmp[5];
							$info[3]=join ' ',$tmp[7],$tmp[8],$tmp[9];
						}
					}
					else{
						if ($tmp[3] !~/^\d*$/){
							$info[1]=$tmp[1];
							$info[2]=$tmp[3];
							$info[3]=$tmp[4];
						}
						else{
							$info[1]=$tmp[1];
							$info[2]=$tmp[2];
							$info[3]=$tmp[3];
						}
					}
					# $token is now the first token after the text run
					# and has not been examined yet: hand it to the top
					# of the loop instead of fetching past it, so it goes
					# through the same filters as any other token.
					next;
				}
				&write($token);
			}
			elsif($bloc % 2 == 0){&write($token);}
			$token = $p->get_token;
		}
	}
	# Buffered write errors only show up when the handle is closed.
	close(N_HTML) or warn "erreur a la fermeture de ./leg_euro/$fich: $!\n";
}

# Append $year to year.txt unless a line equal to it is already present.
#
# The lookup is done in Perl with an exact line comparison, so a directory
# name is never placed on a shell command line and "99" is not mistaken
# for an entry just because "1999" is listed. A year.txt that cannot be
# opened for reading is treated as empty; the append below creates it.
sub _parcours_remember_year{
	my ($year)=@_;
	if (open(my $in,'<','year.txt')){
		while (defined(my $seen=<$in>)){
			chomp $seen;
			if ($seen eq $year){
				close($in);
				return;
			}
		}
		close($in);
	}
	open(my $out,'>>','year.txt') or die "impossible de creer year.txt \n";
	print $out "$year\n";
	close($out);
}

# Walk the directory tree below $dir and run drop() on every plain file.
#
# Parameter:
#   $dir - directory to explore (entries are listed with glob("$dir/*")).
#
# Sub-directories named "abc", "ind", "new" or "reg" are not entered.
# Every other sub-directory is explored recursively; when its name contains
# a digit the name is also recorded once in year.txt.
# Progress messages are printed on STDOUT.
sub parcours{
	my ($dir)=@_;
	print "EXPLORE  $dir\n";
	foreach my $i (glob("$dir/*")){
		# Only directories that hold the texts are processed.
		my @tmpdir = split '/',$i;
		my $rep=pop @tmpdir;
		print "rep  !$rep!\n";
		if ((-d $i) && ($rep ne "abc") && ($rep ne "ind") && ($rep ne "new") && ($rep ne "reg")){
			if ($rep=~/\d/){
				_parcours_remember_year($rep);
			}
			parcours($i);
		}
		if (-f $i){drop($i);}
	}
}
#main
use HTML::TokeParser;
# Root of the downloaded site tree to process.
my $dir="europa.eu.int";

# Start every run with an empty desc.txt.
open(my $desc_fh,'>','desc.txt') or die "impossible de creer desc.txt \n";
close($desc_fh);

# Output directory for the cleaned pages. Created with the built-in mkdir
# (no shell involved) and only when it does not exist yet.
unless (-d "leg_euro"){
	mkdir("leg_euro") or die "LE REP leg_euro N EST PAS CREE";
}

&parcours($dir);

