blob: 3f026a6046b2db581742ce929df5f4c4ea23c0d8 [file] [log] [blame]
# Part of get-flash-videos. See get_flash_videos for copyright.
package FlashVideo::Generic;
use strict;
use FlashVideo::Utils;
use URI;
use FlashVideo::URLFinder;
use URI::Escape qw(uri_unescape);
use HTML::Entities qw(decode_entities);
our $VERSION = '0.01';
sub Version() { $VERSION; }
my $video_re = qr!http[-:/a-z0-9%_.?=&]+@{[EXTENSIONS]}
# Grab any params that might be used for auth..
(?:\?[-:/a-z0-9%_.?=&]+)?!xi;
sub find_video {
my ($self, $browser, $embed_url, $prefs) = @_;
# First strategy - identify all the Flash video files, and download the
# biggest one. Yes, this is hacky.
if (!$browser->success) {
$browser->get($browser->response->header('Location'));
die "Couldn't download URL: " . $browser->response->status_line
unless $browser->success;
}
my ($possible_filename, $actual_url, $title);
$title = extract_title($browser);
my @flv_urls = map {
(m{http://.+?(http://.+?@{[EXTENSIONS]})}i) ? $1 : $_
} ($browser->content =~ m{($video_re)}gi);
if (@flv_urls) {
require LWP::Simple;
require Memoize;
Memoize::memoize("LWP::Simple::head");
@flv_urls = sort { (LWP::Simple::head($a))[1] <=> (LWP::Simple::head($b))[1] } @flv_urls;
$possible_filename = (split /\//, $flv_urls[-1])[-1];
# Un-escape URLs if necessary
if ($flv_urls[-1] =~ /^http%3a%2f%2f/) {
$flv_urls[-1] = uri_unescape($flv_urls[-1])
}
$actual_url = url_exists($browser->clone, $flv_urls[-1]);
}
my $filename_is_reliable;
if(!$actual_url) {
RE: for my $regex(
qr{(?si)<embed.*?flashvars=["']?([^"'>]+)},
qr{(?si)<embed.*?src=["']?([^"'>]+)},
qr{(?si)<a[^>]* href=["']?([^"'>]+?@{[EXTENSIONS]})},
qr{(?si)<object[^>]*>.*?<param [^>]*value=["']?([^"'>]+)},
qr{(?si)<object[^>]*>(.*?)</object>},
# Attempt to handle scripts using flashvars / swfobject
qr{(?si)<script[^>]*>(.*?)</script>}) {
for my $param($browser->content =~ /$regex/gi) {
(my $url, $possible_filename, $filename_is_reliable) = find_file_param($browser->clone, $param, $prefs);
if($url) {
my $resolved_url = url_exists($browser->clone, $url);
if($resolved_url) {
$actual_url = $resolved_url;
last RE;
}
}
}
}
if(!$actual_url) {
for my $iframe($browser->content =~ /<iframe[^>]+src=["']?([^"'>]+)/gi) {
$iframe = decode_entities($iframe);
$iframe = URI->new_abs($iframe, $browser->uri);
$iframe = decode_entities($iframe);
debug "Found iframe: $iframe";
my $sub_browser = $browser->clone;
$sub_browser->get($iframe);
# Recurse!
my($package, $possible_url) = FlashVideo::URLFinder->find_package($iframe, $sub_browser);
# Before fetching the url, give the package a chance
if($package->can("pre_find")) {
$package->pre_find($sub_browser);
}
info "Downloading $iframe";
$sub_browser->get($iframe);
my($actual_url, @suggested_fnames) = eval {
$package->find_video($sub_browser, $possible_url, $prefs);
};
return $actual_url, @suggested_fnames if $actual_url;
}
}
}
my @filenames;
return $actual_url, $possible_filename if $filename_is_reliable;
$possible_filename =~ s/\?.*//;
# The actual filename, provided it looks like it might be reasonable
# (not just numbers)..
push @filenames, $possible_filename if $possible_filename
&& $possible_filename !~ /^[0-9_.]+@{[EXTENSIONS]}$/;
# The title of the page, if it isn't similar to the filename..
my $ext = substr(($actual_url =~ /(@{[EXTENSIONS]})$/)[0], 1);
push @filenames, title_to_filename($title, $ext) if
$title && $title !~ /\Q$possible_filename\E/i;
# A title with just the timestamp in it..
push @filenames, get_video_filename() if !@filenames;
return $actual_url, @filenames if $actual_url;
# As a last ditch attempt, download the SWF file as in some cases, sites
# use an SWF movie file for each FLV.
# Get SWF URL(s)
my %swf_urls;
if (eval { require URI::Find }) {
my $finder = URI::Find->new(
sub { $swf_urls{$_[1]}++ if $_[1] =~ /\.swf$/i }
);
$finder->find(\$browser->content);
}
else {
# Extract URLs in a frail way.
my $content = $browser->content;
while($content =~ m{(http://[^ "']+?\.swf)}ig) {
$swf_urls{$1}++;
}
}
if (%swf_urls) {
foreach my $swf_url (keys %swf_urls) {
if (my ($flv_url, $title) = search_for_flv_in_swf($browser, $swf_url)) {
return $flv_url, title_to_filename($title);
}
}
}
die "No URLs found";
}
sub search_for_flv_in_swf {
my ($browser, $swf_url) = @_;
$browser = $browser->clone();
$browser->get($swf_url);
if (!$browser->success) {
die "Couldn't download SWF URL $swf_url: " .
$browser->response->status_line();
}
# SWF data might be compressed.
my $swf_data = $browser->content;
if ('C' eq substr $swf_data, 0, 1) {
if (eval { require Compress::Zlib }) {
$swf_data = Compress::Zlib::uncompress(substr $swf_data, 8);
}
else {
die "Compress::Zlib is required to uncompress compressed SWF files.\n";
}
}
if ($swf_data =~ m{(http://.{10,300}?\.flv)}i) {
my $flv_url = $1;
my $filename = uri_unescape(File::Basename::basename(URI->new($flv_url)->path()));
$filename =~ s/\.flv$//i;
return ($flv_url, $filename);
}
return;
}
sub find_file_param {
my($browser, $param, $prefs) = @_;
for my $file($param =~ /(?:video|movie|file|path)_?(?:href|src|url)?['"]?\s*[=:,]\s*['"]?([^&'" ]+)/gi,
$param =~ /(?:config|playlist|options)['"]?\s*[,:=]\s*['"]?(http[^'"&]+)/gi,
$param =~ /['"=](.*?@{[EXTENSIONS]})/gi,
$param =~ /([^ ]+@{[EXTENSIONS]})/gi,
$param =~ /SWFObject\(["']([^"']+)/) {
debug "Found $file";
my ($actual_url, $filename, $filename_is_reliable) = guess_file($browser, $file, '', $prefs);
if(!$actual_url && $file =~ /\?(.*)/) {
# Maybe we have query params?
debug "Trying query param on $1";
for my $query_param(split /[;&]/, $1) {
my($query_key, $query_value) = split /=/, $query_param;
debug "Found $query_value from $query_key";
($actual_url, $filename, $filename_is_reliable)
= guess_file($browser, $query_value, '', $prefs);
last if $actual_url;
}
}
if($actual_url) {
my $possible_filename = $filename || (split /\//, $actual_url)[-1];
return $actual_url, $possible_filename, $filename_is_reliable;
}
}
if($param =~ m{(rtmp://[^ &"']+)}) {
info "This looks like RTMP ($1), no generic support yet..";
}
return;
}
sub guess_file {
my($browser, $file, $once, $prefs) = @_;
# Contains lots of URI encoding, so try escaping..
$file = uri_unescape($file) if scalar(() = $file =~ /%[A-F0-9]{2}/gi) > 3;
my $orig_uri = URI->new_abs($file, $browser->uri);
info "Guessed $orig_uri trying...";
if($orig_uri) {
my $uri = url_exists($browser->clone, $orig_uri);
if($uri) {
# Check to see if this URL is for a supported site.
my ($package, $url) = FlashVideo::URLFinder->find_package($uri,
$browser->clone);
if($package && $package ne __PACKAGE__) {
debug "$uri is supported by $package.";
(my $browser_on_supported_site = $browser->clone())->get($uri);
return $package->find_video($browser_on_supported_site, $uri, $prefs), 1;
}
my $content_type = $browser->response->header("Content-type");
if($content_type =~ m!^(text|application/xml)!) {
# Just in case someone serves the video itself as text/plain.
$browser->add_header("Range", "bytes=0-10000");
$browser->get($uri);
$browser->delete_header("Range");
if(FlashVideo::Downloader->check_magic($browser->content)
|| $uri =~ m!$video_re!) {
# It's a video..
debug "Found a video at $uri";
return $uri;
}
# If this looks like HTML we have no hope of guessing right, so
# give up now.
return if $browser->content =~ /<html[^>]*>/i;
if($browser->content =~ m!($video_re)!) {
# Found a video URL
return $1;
} elsif(!defined $once
&& $browser->content =~ m!(http[-:/a-zA-Z0-9%_.?=&]+)!i) {
# Try once more, one level deeper..
return guess_file($browser, $1, 1, $prefs);
} else {
info "Tried $uri, but no video URL found";
}
} elsif($content_type =~ m!application/! && $uri ne $orig_uri) {
# We were redirected, maybe something in the new URL?
return((find_file_param($browser, $uri))[0]);
} else {
return $uri->as_string;
}
} elsif(not defined $once) {
# Try using the location of the .swf file as the base, if it's different.
if($browser->content =~ /["']([^ ]+\.swf)/) {
my $swf_uri = URI->new_abs($1, $browser->uri);
if($swf_uri) {
my $new_uri = URI->new_abs($file, $swf_uri);
debug "Found SWF: $swf_uri -> $new_uri";
if($new_uri ne $uri) {
return guess_file($browser, $new_uri, 1, $prefs);
}
}
}
}
}
return;
}
1;