# Library to wrap LWP::UserAgent to put in a random delay between
# requests and set the User-Agent string.  We really should be using
# LWP::RobotUA but this is better than nothing.
#
# If you're sure your app doesn't need a random delay (because it is
# fetching from a site designed for that purpose) then set
# $XMLTV::Get_nice::Delay to zero, or a value in seconds.  This is the
# maximum delay - on average the sleep will be half that.
#
# This random delay will be between 0 and 5 ($Delay) seconds.  This means
# some sites will complain you're grabbing too fast (since 20% of your
# grabs will be less than 1 second apart).  To introduce a minimum delay
# set $XMLTV::Get_nice::MinDelay to a value in seconds.
# This will be added to $Delay to derive the actual delay used.
# E.g. with Delay = 5 and MinDelay = 3, the actual delay will be
# between 3 and 8 seconds.
#
# get_nice() is the function to call, however
# XMLTV::Get_nice::get_nice_aux() is the one to cache with
# XMLTV::Memoize or whatever.  If you want an HTML::Tree object use
# get_nice_tree().
# Alternatively, get_nice_json() will get you a JSON object,
# or get_nice_xml() will get an XML::Parser 'Tree' object.

use strict;

package XMLTV::Get_nice;

# use version number for feature detection:
# 0.005065 : new methods get_nice_json(), get_nice_xml()
# 0.005065 : add decode option to get_nice_tree()
# 0.005065 : expose the LWP response object ($Response)
# 0.005066 : support unknown tags in HTML::TreeBuilder ($IncludeUnknownTags)
# 0.005067 : new method post_nice_json()
# 0.005070 : skip get_nice sleep for cached pages
# 0.005070 : support passing HTML::TreeBuilder options via a hashref
our $VERSION = 0.005070;

use base 'Exporter';
our @EXPORT = qw(get_nice get_nice_tree get_nice_xml get_nice_json
                 post_nice_json error_msg);
use Encode qw(decode);
use LWP::UserAgent;
use XMLTV;

our $Delay = 5;        # in seconds
our $MinDelay = 0;     # in seconds
our $FailOnError = 1;  # fail on fetch error
our $Response;         # LWP response object
our $IncludeUnknownTags = 0;  # add support for HTML5 tags which are unknown
                              # to older versions of TreeBuilder (and
                              # therefore ignored by it)

our $ua = LWP::UserAgent->new;
$ua->agent("xmltv/$XMLTV::VERSION");
$ua->env_proxy;

our %errors = ();

sub error_msg($) {
    my ($url) = @_;
    $errors{$url};
}

sub get_nice( $ ) {
    # This is to ensure scalar context, to work around weirdnesses
    # with Memoize (I just can't figure out how SCALAR_CACHE and
    # LIST_CACHE relate to each other, with or without MERGE).
    #
    return scalar get_nice_aux($_[0]);
}

# Fetch page and return as HTML::Tree object.
# Optional arguments:
#    i) a function to put the page data through (eg, to clean up bad
#       characters) before parsing.
#   ii) convert incoming page to UNICODE using this codepage (use "UTF-8"
#       for strict utf-8)
#  iii) a hashref containing options to configure the HTML::TreeBuilder
#       object before parsing
#
sub get_nice_tree( $;$$$ ) {
    my ($uri, $filter, $codepage, $htb_opts) = @_;
    require HTML::TreeBuilder;
    my $content = get_nice $uri;
    $content = $filter->($content) if $filter;
    if ($codepage) {
        $content = decode($codepage, $content);
    }
    else {
        $content = decode('UTF-8', $content);
    }
    my $t = HTML::TreeBuilder->new();
    $t->ignore_unknown(!$IncludeUnknownTags);
    if (ref $htb_opts eq 'HASH') {
        $t->$_($htb_opts->{$_}) foreach (keys %$htb_opts);
    }
    $t->parse($content) or die "cannot parse content of $uri\n";
    $t->eof;
    return $t;
}
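# Example usage of get_nice_tree() (a minimal sketch; the URL, the filter
# sub and the choice of TreeBuilder option are hypothetical, purely for
# illustration):
#
#   use XMLTV::Get_nice;
#   $XMLTV::Get_nice::MinDelay = 1;  # wait at least 1s between fetches
#
#   my $t = get_nice_tree('http://example.com/schedule.html',
#                         sub { my $c = shift; $c =~ s/&nbsp;/ /g; return $c },
#                         'UTF-8',
#                         { no_space_compacting => 1 });
#   foreach my $cell ($t->look_down(_tag => 'td')) {
#       print $cell->as_text(), "\n";
#   }
#   $t->delete;  # free the tree explicitly when done with it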
# Fetch page and return as XML::Parser 'Tree' object.
# Optional arguments:
#    i) a function to put the page data through (eg, to clean up bad
#       characters) before parsing.
#   ii) convert incoming page to UNICODE using this codepage (use "UTF-8"
#       for strict utf-8)
#
sub get_nice_xml( $;$$ ) {
    my ($uri, $filter, $codepage) = @_;
    require XML::Parser;
    my $content = get_nice $uri;
    $content = $filter->($content) if $filter;
    if ($codepage) {
        $content = decode($codepage, $content);
    }
    else {
        $content = decode('UTF-8', $content);
    }
    my $t = XML::Parser->new(Style => 'Tree')->parse($content)
        or die "cannot parse content of $uri\n";
    return $t;
}

# Fetch page and return as JSON object.
# Optional arguments:
#    i) a function to put the page data through (eg, to clean up bad
#       characters) before parsing.
#   ii) convert incoming UTF-8 to UNICODE
#
sub get_nice_json( $;$$ ) {
    my ($uri, $filter, $utf8) = @_;
    require JSON;
    my $content = get_nice $uri;
    $content = $filter->($content) if $filter;
    $utf8 = defined $utf8 ? 1 : 0;
    my $t = JSON->new()->utf8($utf8)->decode($content)
        or die "cannot parse content of $uri\n";
    return $t;
}

my $last_get_time;
my $last_get_from_cache;

sub get_nice_aux( $ ) {
    my $url = shift;

    if (defined $last_get_time
        && (defined $last_get_from_cache && !$last_get_from_cache)) {
        # A page has already been retrieved recently.  See if we need
        # to sleep for a while before getting the next page - being
        # nice to the server.
        #
        my $next_get_time = $last_get_time + (rand $Delay) + $MinDelay;
        my $sleep_time = $next_get_time - time();
        sleep $sleep_time if $sleep_time > 0;
    }

    my $r = $ua->get($url);

    # Then start the delay from this time on the next fetch - so we
    # make the gap _between_ requests rather than from the start of
    # one request to the start of the next.  This punishes modem users
    # whose individual requests take longer, but it also punishes
    # downloads that take a long time for other reasons (large file,
    # slow server) so it's about right.
    #
    $last_get_time = time();

    # expose the response object for those grabbers which need to process
    # the headers, status code, etc.
    $Response = $r;

    # Set flag if last fetch was from local HTTP::Cache::Transparent cache.
    # Check for presence of both x-content-unchanged and x-cached headers.
    $last_get_from_cache = (defined $r->{'_headers'}{'x-content-unchanged'}
                            && defined $r->{'_headers'}{'x-cached'}
                            && $r->{'_headers'}{'x-cached'} == 1);

    if ($r->is_error) {
        # At the moment download failures seem rare, so the script dies if
        # any page cannot be fetched.  We could later change this routine
        # to return undef on failure.  But dying here makes sure that a
        # failed page fetch doesn't get stored in XMLTV::Memoize's cache.
        #
        die "could not fetch $url, error: " . $r->status_line . ", aborting\n"
            if $FailOnError;
        $errors{$url} = $r->status_line;
        return undef;
    }
    else {
        return $r->content;
    }
}
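# Example of the non-fatal error path and the exposed response object (a
# minimal sketch; the URL is hypothetical):
#
#   $XMLTV::Get_nice::FailOnError = 0;  # record errors instead of dying
#   my $page = get_nice('http://example.com/listings.html');
#   if (!defined $page) {
#       warn 'fetch failed: '
#            . error_msg('http://example.com/listings.html') . "\n";
#   }
#   else {
#       # $Response holds the HTTP::Response from the last fetch
#       print $XMLTV::Get_nice::Response->header('Content-Type'), "\n";
#   }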
# Fetch page via a JSON object in the Content and return as a JSON object.
# Arguments:
#    URI to post to
#    JSON object with the AJAX data to be posted,
#      e.g. '{ "programId":"123456", "channel":"BBC" }'
#
sub post_nice_json( $$ ) {
    my $url = shift;
    my $json = shift;
    require JSON;

    if (defined $last_get_time) {
        # A page has already been retrieved recently.  See if we need
        # to sleep for a while before getting the next page.
        #
        my $next_get_time = $last_get_time + (rand $Delay) + $MinDelay;
        my $sleep_time = $next_get_time - time();
        sleep $sleep_time if $sleep_time > 0;
    }

    my $r = $ua->post($url,
                      'Content_Type' => 'application/json; charset=utf-8',
                      'Content'      => $json);

    $last_get_time = time();

    # expose the response object for those grabbers which need to process
    # the headers, status code, etc.
    $Response = $r;

    if ($r->is_error) {
        die "could not fetch $url, error: " . $r->status_line . ", aborting\n"
            if $FailOnError;
        $errors{$url} = $r->status_line;
        return undef;
    }
    else {
        my $content = JSON->new()->utf8(1)->decode($r->content)
            or die "cannot parse content of $url\n";
        return $content;
    }
}

1;
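# Example usage of post_nice_json() (a minimal sketch; the URL, payload and
# reply structure are hypothetical).  The payload must already be a JSON
# string; the reply comes back as a decoded Perl structure:
#
#   my $reply = post_nice_json('http://example.com/api/programme',
#                              '{"programId":"123456","channel":"BBC"}');
#   print $reply->{title}, "\n" if ref $reply eq 'HASH';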