#!/usr/bin/perl

use strict;
use warnings;
use utf8;

use LWP::Simple;
use File::Basename;
use POSIX qw(strftime);

our $NSS_EXCL = qr/example\.org/o;

sub main {
	my ($nss_fn, $names_fn, $attrs_fn) = @_;

	unless (defined $nss_fn && defined $names_fn && defined $attrs_fn) {
		print STDERR "Usage: $0 <nss_fn> <names_fn> <attrs_fn>\n";
		return (1);
	}

	my @xeps;
	my $xep;

	while (<STDIN>) {
		my $line = $_;

		# Skip empty lines and comments.
		next if $line =~ /^\s*$/o || $line =~ /\s*#/o;

		if ($line =~ /^XEP:\t\t(.+)$/o) {
			push @xeps, $xep if defined $xep;
			$xep = {'id' => $1};
		} elsif ($line =~ /^([A-Z]+):\t\t(.+)$/o) {
			$xep->{lc($1)} = $2;
		}
	}
	push @xeps, $xep if %$xep;

	# If the XEP list is empty, exit.
        return (0) unless (@xeps);

	# Open files.
	open(NSS,   ">$nss_fn");
	open(NAMES, ">$names_fn");
	open(ATTRS, ">$attrs_fn");

	# Print header.
	my $script = basename($0);
	my $revision = '$Revision$' =~ /Revision: ([\d]+)/ ? $1 : 0;
	my $now = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime;

	my $header = << "EOF";
# Generated by $script r$revision on $now
# vim:ft=conf:
EOF
	print NSS $header;
	print NAMES $header;
	print ATTRS $header;

	foreach $xep (@xeps) {
		# Print XEP entry.
		my $id = $xep->{'id'};
		my $url = $xep->{'url'};
		my $name = $xep->{'name'};
		my $type = $xep->{'type'};
		my $status = $xep->{'status'};
		my $date = $xep->{'date'};

		print "$id: $name\n";

		# Fetch page.
		my $content = get($url);
		unless (defined $content) {
			print STDERR "  Can't retrieve URL '$url'\n";
			next;
		}

		# Check wether we have an XML schema or a DTD.
		my $result;
		if ($content =~ /xs:schema.+targetNamespace/so) {
			# Found a schema!
			print "  Read XML schema(s)\n";
			$result = extract_from_schema($xep, $content);
		} elsif ($content =~ /!ELEMENT/so) {
			# Found a DTD!
			print "  Read DTD\n";
			$result = extract_from_dtd($xep, $content);
		} else {
			next;
		}

		next unless $result;

		my $entry = << "EOF";

# --------------------------------------------------------------------
# XEP:		$id
# NAME:		$name
# URL:		$url
EOF
		$entry .= "# STATUS:	$status\n" if $status;
		$entry .= "# TYPE:		$type\n" if $type;
		$entry .= "# DATE:		$date\n" if $date;

		print NSS "$entry\n";
		print NAMES $entry;
		print ATTRS $entry;

		foreach my $ns (sort keys %$result) {
			print NSS "$ns\n";

			my @names = @{$result->{$ns}->{'names'}};
			print NAMES "\n# $ns\n";
			print NAMES join("\n", @names)."\n" if (@names);

			my @attrs = @{$result->{$ns}->{'attrs'}};
			print ATTRS "\n# $ns\n";
			print ATTRS join("\n", @attrs)."\n" if (@attrs);
		}
	}

	close(NSS);
	close(NAMES);
	close(ATTRS);
}

sub extract_from_schema ($$) {
	my ($xep, $content) = @_;

	my $result;
	my $ns;

	foreach my $line (split /[\r\n]+/, $content) {
		if ($line =~ /targetNamespace=['"]([^'"]+)['"]/o) {
			$ns = $1;
			if ($ns =~ $NSS_EXCL) {
				print "  Skip namespace '$ns'\n";
				undef $ns;
				next;
			}

			unless (exists $result->{$ns}) {
				print "  Add namespace '$ns'\n";
				$result->{$ns} = {
					'names' => [],
					'attrs' => []
				};
			}
		}

		next unless (defined $ns);

		if ($line =~ /xs:element\s+name=['"]([^'"]+)['"]/o) {
			my $name = $1;
			push @{$result->{$ns}->{'names'}}, $name
			  unless grep(/^$name$/, @{$result->{$ns}->{'names'}});
		} elsif ($line =~ /xs:attribute\s+name=['"]([^'"]+)['"]/o) {
			my $attr = $1;
			next if $attr =~ /^xml:/;
			push @{$result->{$ns}->{'attrs'}}, $attr
			  unless grep(/^$attr$/, @{$result->{$ns}->{'attrs'}});
		}
	}

	return ($result);
}

sub extract_from_dtd ($$) {
	my ($xep, $content) = @_;

	my $result;
	my $ns;

	foreach my $line (split /[\r\n]+/, $content) {
		if ($line =~ /Short\s+Name:\s*([^\s]+)<br/io) {
			$ns = $1;
			if ($ns =~ $NSS_EXCL) {
				print "  Skip namespace '$ns'\n";
				undef $ns;
				next;
			}

			unless (exists $result->{$ns}) {
				print "  Add namespace '$ns'\n";
				$result->{$ns} = {
					'names' => [],
					'attrs' => []
				};
			}
		}

		next unless (defined $ns);

		if ($line =~ /!ELEMENT\s+([a-zA-Z0-9-]+)/o) {
			my $name = $1;
			push @{$result->{$ns}->{'names'}}, $name
			  unless grep(/^$name$/, @{$result->{$ns}->{'names'}});
		} elsif ($line =~ /!ATTLIST\s+[^\s]+\s+([^\s]+)/o) {
			my $attr = $1;
			next if $attr =~ /^xml:/;
			push @{$result->{$ns}->{'attrs'}}, $attr
			  unless grep(/^$attr$/, @{$result->{$ns}->{'attrs'}});
		}
	}

	return ($result);
}

main(@ARGV);
