#!/usr/bin/perl -w =pod =head1 NAME tv_remove_some_overlapping - Remove some overlapping programmes from XMLTV data. =head1 SYNOPSIS tv_remove_some_overlapping [--help] [--output FILE] [FILE...] =head1 DESCRIPTION Read one or more XMLTV files and write a file to standard output containing the same data, except that some 'magazine' programmes which seem to contain two or more other programmes are removed. For example, if 'Schools TV' runs from 10:00 to 12:00, and there are two programmes 'History' from 10:00 to 11:00 and 'Geography' from 11:00 to 12:00 on the same channel, then 'Schools TV' could be removed. A programme is removed only if there are two or more other programmes which partition its timeslot, which implies that it and these other programmes must have stop times specified. To avoid throwing away any real programmes, no programme will be discarded if it has content data other than title and URL. Filtering this tool won't remove all overlapping programmes but it will deal with the 'big magazine programme containing smaller programmes' data commonly seen from listings sources. B<--output FILE> write to FILE rather than standard output =head1 SEE ALSO L. =head1 AUTHOR Ed Avis, ed@membled.com =cut use strict; use XMLTV; use XMLTV::Version "$XMLTV::VERSION"; use XMLTV::Date; use Getopt::Long; use Date::Manip; BEGIN { if (int(Date::Manip::DateManipVersion) >= 6) { Date::Manip::Date_Init("SetDate=now,UTC"); } else { Date::Manip::Date_Init("TZ=UTC"); } } # Use Log::TraceMessages if installed. BEGIN { eval { require Log::TraceMessages }; if ($@) { *t = sub {}; *d = sub { '' }; } else { *t = \&Log::TraceMessages::t; *d = \&Log::TraceMessages::d; Log::TraceMessages::check_argv(); } } use XMLTV::Usage < 1); $boring_programme_key{$_} = 1 foreach map { $_->[0] } @XMLTV::Programme_Attributes; $boring_programme_key{url} = 1; # common in some sources, and boring my ($opt_help, $opt_output); GetOptions('help' => \$opt_help, 'output=s' => \$opt_output) or usage(0); usage(1) if $opt_help; @ARGV = ('-') if not @ARGV; my %w_args = (); if (defined $opt_output) { my $fh = new IO::File ">$opt_output"; die "cannot write to $opt_output\n" if not $fh; %w_args = (OUTPUT => $fh); } # Unfortunately we need to load the whole file before processing. I # don't want to require the input to be sorted, since tv_sort adds # guessed stop times which could cause this program to remove too many # programmes. This is another reason to eventually change tv_sort not # to add stop times (move into tv_guess_stop_times or whatever). # my ($encoding, $credits, $ch, $progs) = @{XMLTV::parsefiles(@ARGV)}; my $w = new XMLTV::Writer(%w_args, encoding => $encoding); $w->start($credits); $w->write_channels($ch); # Since zero-length and unknown-length programmes are always written # unchanged, we could write them immediately and discard them. But # it's a bit nicer to write the output in the same order as the input. # However, we don't bother to index these programmes, they cannot / # should not be used in looking for partitionings. # my %by_channel_and_start; foreach (@$progs) { push @{$by_channel_and_start{$_->{channel}}{pd $_->{start}}}, $_ if interesting $_; } $w->write_programme($_) foreach grep { should_write($_) } @$progs; $w->end(); exit(); # Given that %by_channel_and_start and %boring_programme_key have been # set up, should a programme (with start and stop time) be written? # sub should_write( $ ) { my $p = shift; # Always write zero length and unknown-length programmes. return 1 if not interesting $p; # If this programme cannot be partitioned by at least two others, # definitely write it. # return 1 unless exists_partition(pd $p->{start}, pd $p->{stop}, $p->{channel}, 2, { $p => 1 }); foreach (keys %$p) { if (not $boring_programme_key{$_}) { warn <{start} on $p->{channel} because it has $_ END ; return 1; } } return 0; } # We process only programmes with stop time and nonzero length. sub interesting( $ ) { my $p = shift; my $stop = $p->{stop}; return 0 if not defined $stop; my $cmp = Date_Cmp(pd $p->{start}, pd $stop); if ($cmp < 0) { # start < stop, okay. return 1; } elsif ($cmp == 0) { # Zero length, won't consider. return 0; } elsif ($cmp > 0) { warn "programme on $p->{channel} " . "with start time ($p->{start}) " . "before stop time ($stop)\n"; return 0; } else { die } } # Does there exist a sequence of programmes hopping from $start to # $stop, where none of the programmes is in $used and the sequence is # at least $min_length long? # # $start and $stop are Date::Manip objects, $used a hash whose keys # are used programmes and whose values are true. # sub exists_partition( $$$$$ ) { my ($start, $stop, $channel, $min_length, $used) = @_; # local $Log::TraceMessages::On = 1; t "seeking at least $min_length $start to $stop on $channel"; t '... not including: ' . d $used; my $cmp = Date_Cmp($start, $stop); if ($cmp < 0) { t 'start before stop, okay'; my @poss = grep { not $used->{$_} } @{$by_channel_and_start{$channel}{$start}}; t 'possible first steps of path: ' . d \@poss; --$min_length if $min_length; foreach my $p (@poss) { return 1 if exists_partition(pd $p->{stop}, $stop, $channel, $min_length, { %$used, $p => 1 }); } t 'no paths found'; return 0; } elsif ($cmp == 0) { t 'zero length, so path of length zero'; return not $min_length; } elsif ($cmp > 0) { t 'stop < start, so no path'; return 0; } else { die } } # Lift parse_date() to handle undef. sub pd( $ ) { for ($_[0]) { return undef if not defined; return parse_date($_); } } exit 1;