#!/usr/bin/perl -w

# Copyright © 2001 Colin Watson
# Copyright © 2008 Jordà Polo
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, you can find it on the World Wide
# Web at http://www.gnu.org/copyleft/gpl.html, or write to the Free
# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301, USA.

# Should be run from the top level of the Lintian source tree or with
# LINTIAN_ROOT set appropriately.  You need copies of all the relevant manuals
# installed in the standard places locally (packages debian-policy,
# developers-reference, doc-base, python, lintian, menu, java-policy and
# vim-doc).

use strict;
use warnings;
use autodie;
use open qw(:std :utf8);

use File::Basename;
use List::MoreUtils qw(none);
use POSIX qw(strftime);

BEGIN {
    my $LINTIAN_ROOT = $ENV{'LINTIAN_ROOT'};
    if (not $LINTIAN_ROOT) {
        require Cwd;
        $ENV{'LINTIAN_ROOT'} = $LINTIAN_ROOT = Cwd::cwd();
    } else {
        chdir($LINTIAN_ROOT);
    }
}

my $LINTIAN_ROOT = $ENV{'LINTIAN_ROOT'};

# For each manual, we need:
#  * Location of the manual index on the local filesystem
#  * Base URL for the eventual target of the reference (or empty string if no
#    public URL is available)
#  * Regex to match the possible references
#  * Mapping from regex fields to reference fields (array of arrays of
#    keywords: url, section title; the position of each keyword in the array
#    defines which is its corresponding group in the regex)
#
# Optionally, if there are subsections that aren't available in the index, an
# additional regex can be defined to match possible references on other pages
# of the manual.

my $title_re = qr/<title\s?>(.+?)(?:\s+v[\d.]+)?<\/title\s?>/i;
my $link_re
  = qr/<link href="(.+?)" rel="[\w]+" title="([A-Z]|[A-Z]?[\d\.]+?)\.?\s+([\w\s[:punct:]]+?)">/;
my $index_re
  = qr/<a href="(.+?)">\s*([A-Z]|[A-Z]?[\d\.]+?)\.?\s+([\w\s[:punct:]]+?)\s*<\/a>/;
my $fields = [['url'], ['section'], ['title']];
my $dbk_index_re
  = qr/([\d.]+?)\.\s+<a\s*href="(.+?)"\s*>([\w\s[:punct:]]+?)<\/a\s*>/i;
my $dbk_fields = [['section'], ['url'], ['title']];

my %manuals = (
    'policy' => [
        '/usr/share/doc/debian-policy/policy.html/index.html',
        'https://www.debian.org/doc/debian-policy/',
        # $index_re would match the Policy's TOC and work otherwise, but
        # the TOC is generated only up to 3 levels deep as of 4.1.1.1, and
        # we want refs to all levels.
        qr{
              <(h[2-5])>\s*
              ([A-Z]|[A-Z]?[\d\.]+?)
              \.?\s+
              ([\w\s[:punct:]]+?)
              <a\s(?:[^>]*?)\bhref="([^"]+)".+?</a>
              </\1>
        }x,
        [['_ignored'], ['section'], ['title'], ['url']]
    ],
    'menu-policy' => [
        '/usr/share/doc/debian-policy/menu-policy.html/index.html',
        'https://www.debian.org/doc/packaging-manuals/menu-policy/',
        $index_re, $fields
    ],
    'perl-policy' => [
        '/usr/share/doc/debian-policy/perl-policy.html/index.html',
        'https://www.debian.org/doc/packaging-manuals/perl-policy/',
        $index_re, $fields
    ],
    'python-policy' => [
        '/usr/share/doc/python/python-policy.html/index.html',
        'https://www.debian.org/doc/packaging-manuals/python-policy/',
        $index_re, $fields
    ],
    'java-policy' => [
        '/usr/share/doc/java-policy/debian-java-policy/index.html',
        'https://www.debian.org/doc/packaging-manuals/java-policy/',
        $dbk_index_re,
        $dbk_fields
    ],
    'vim-policy' => [
        '/usr/share/doc/vim-doc/vim-policy.html/index.html',
        'http://pkg-vim.alioth.debian.org/vim-policy.html/',
        $dbk_index_re,
        $dbk_fields
    ],
    'lintian' => [
        '/usr/share/doc/lintian/lintian.html/index.html',
        'https://lintian.debian.org/manual/',
        $dbk_index_re, $dbk_fields
    ],
    'devref' => [
        '/usr/share/doc/developers-reference/index.html',
        'https://www.debian.org/doc/developers-reference/',
        $index_re,
        $fields,
#<<< no perl tidy
# breaking this regex up with x should work, but for some reason it
# trips a false positive in the minimum-version (thinking it requires
# perl5.17 for some bizarre reason).
        qr{<h[45] class="title"><a id="(.+?)"></a>([\d\.]+?)\.? ([\w\s[:punct:]]+?)</h[45]>},
#>>>
    ],
    'menu' => [
        '/usr/share/doc/menu/html/index.html',
        'https://www.debian.org/doc/packaging-manuals/menu.html/',
        $index_re, $fields
    ],
    'doc-base' => [
        '/usr/share/doc/doc-base/doc-base.html/index.html', '',
        $index_re, $fields
    ],
    'debconf-spec' => [
        '/usr/share/doc/debian-policy/debconf_specification.html',
        join(q{/},
            'https://www.debian.org',
            'doc/packaging-manuals/debconf_specification.html'),
        $index_re,
        $fields
    ],
    'fhs' => [
        '/usr/share/doc/debian-policy/fhs/fhs-2.3.html',
        'http://www.pathname.com/fhs/pub/fhs-2.3.html',
        qr/<a\s*href="(#.+?)"\s*>([\w\s[:punct:]]+?)<\/a\s*>/i,
        [['section', 'url'], ['title']]
    ],
);

# Check all of the manuals are present before trying anything
{
    my $ok = 1;
    foreach my $manual (sort keys %manuals) {
        my ($index, undef) = @{$manuals{$manual}};
        unless (-f $index) {
            print STDERR
              "Manual \"$manual\" not available (missing: $index)\n";
            $ok = 0;
        }
    }
    exit 1 unless $ok;
}

# extract_refs -- Extract manual references from HTML file.
#
# This function takes the output file handle, the path to the page, and the
# regex to match, and prints references to stdout. The second argument is used
# to decide whether to look for the title (0) or not (1). It returns a list of
# pages linked by the extracted references.
sub extract_refs {
    my ($fh, $manual, $title, $page, $url, $ref_re, $fields) = @_;
    my @linked_pages = ();

    open(my $page_fd, '<', $page);

    # Read until there are 2 newlines. This hack is needed since some lines in
    # the Developer's Reference are cut in the middle of <a>...</a>.
    local $/ = "\n\n";

    my %seen_sections;

    while (<$page_fd>) {
        if (not $title and m/$title_re/) {
            $title = 1;
            my @out = ($manual, '', $1, $url);
            print $fh join('::', @out) . "\n";
        }

        while (m/$ref_re/g) {
            my %ref;
            for (my $i = 0; $i < scalar @{$fields}; $i++) {
                foreach my $c (@{$fields->[$i]}) {
                    my $v = $i + 1;
                    $ref{$c} = eval '$' . $v;
                }
            }

            if ($ref{url} =~ m/^(.+?\.html)#?/i) {
                push(@linked_pages, $1) if none { m/$1/ } @linked_pages;
            }

            # If the extracted URL part doesn't look like a URL, assume it is
            # an anchor and convert to URL accordingly.
            if ($ref{url} and not $ref{url} =~ m/(?:#|\.html$)/i) {
                $ref{url} = basename($page) . "#$ref{url}";
            }

            $ref{title} =~ s/\s+/ /g;
            $ref{title} =~ s,<span[^>]*>(.*?)</span ?>,$1,ig;
            $ref{title} =~ s,<code[^>]*>(.*?)</code ?>,<code>$1</code>,ig;

            $ref{url} = "$url$ref{url}";
            $ref{url} = '' if not $url;

            $ref{section} =~ s/^\#(.+)$/\L$1/;
            # Some manuals reuse section numbers for different references,
            # e.g. the Debian Policy's normal and appendix sections are
            # numbers that clash with each other. Track if we've already
            # seen a section pointing to some other URL than the current one,
            # and uniquify it by appending as many asterisks as necessary.
            $ref{section} .= '*'
              while (defined($seen_sections{$ref{section}})
                && $seen_sections{$ref{section}} ne $ref{url});
            $seen_sections{$ref{section}} = $ref{url};

            my @out = ($manual, $ref{section}, $ref{title}, $ref{url});
            print {$fh} join('::', @out) . "\n";
        }
    }

    close($page_fd);

    return @linked_pages;
}

# Create a new reference file.
open(my $ref_fd, '>', 'data/output/manual-references.new')
  or die "Cannot create data/output/manual-references.new: $!\n";
my $date = strftime('%Y-%m-%d', localtime);
print {$ref_fd} <<"HEADER";
# Data about titles, sections, and URLs of manuals, used to expand references
# in tag descriptions and add links for HTML output.  Each line of this file
# has four fields separated by double colons:
#
#     <manual> :: <section> :: <title> :: <url>
#
# If <section> is empty, that line specifies the title and URL for the whole
# manual.  If <url> is empty, that manual is not available on the web.
#
# Last updated: $date

HEADER

for my $manual (sort keys %manuals) {
    my ($index, $url, $ref_re, $fields, $sub_re) = @{$manuals{$manual}};

    # Extract references from the index.
    my @subpages
      = extract_refs($ref_fd, $manual, 0, $index, $url, $ref_re, $fields);

    # Extract additional subsection references if not available in the index.
    next if not $sub_re;
    foreach my $pagename (@subpages) {
        my $page = dirname($index) . "/$pagename";
        extract_refs($ref_fd, $manual, 1, $page, $url, $sub_re, $fields);
    }
}

# Replace the old reference file.
close($ref_fd);
rename('data/output/manual-references.new', 'data/output/manual-references');

# Local Variables:
# indent-tabs-mode: nil
# cperl-indent-level: 4
# End:
# vim: syntax=perl sw=4 sts=4 sr et
