#!/usr/bin/perl

use warnings; use strict; use v5.14;

# config ==============================================================
my $blog       = 'blogname';                # subdomain, i.e. <this>.tumblr.com
my $max_images = 500;                       # stop once this many images are saved
my $step       = 25;                        # posts requested per API call
my $output_dir = "img/${blog}_images";      # directory the images are written to
my $index      = "$output_dir/index.txt";   # tab-separated log of saved images

my $max_consecutive_badreqs = 50;           # allow this many failed requests in a row
my $max_image_size_kb       = 3 * 1024;     # skip images bigger than this (3mb)
my $max_disk_space_mb;                      # disk space limit in mb; undefined means no limit

my $filenames = 'numbered';                 # 'numbered', 'original' or 'tumblr_id'

# image MIME subtype => extension given to the saved file
my %types = (
	png  => 'png',
	gif  => 'gif',
	jpeg => 'jpg',
);
#======================================================================

use LWP::Simple;
use JSON;

# Create the output directory, including missing parents such as "img/".
# Plain mkdir cannot create intermediate directories. File::Path ships with
# perl, and make_path croaks if the directory cannot be created.
unless (-d $output_dir) {
	require File::Path;
	File::Path::make_path($output_dir);
}

# Decide whether the index needs its header row BEFORE opening it: opening
# in append mode creates the file, so an existence test made afterwards is
# always true and the header would never be written. -s is false for both a
# missing and an empty file.
my $index_needs_header = (-s $index) ? 0 : 1;
open my $idx, ">>", $index or die "can't open $index for appending: $!\n";
print $idx "Tumblr_id\tfilename\turl\n" if $index_needs_header;

# Field names in Tumblr's API output contain dashes, so they are kept in
# variables and used as hash keys.
my $photo_url_1280 = "photo-url-1280";
my $url_with_slug  = "url-with-slug";
my $posts_total    = "posts-total";

# id of the post currently being handled; $type is declared but left undefined
my ($last_id, $type);

# running counters, all starting at zero
my ($got, $badlink, $toobig, $cur, $total, $cs, $downloaded, $start) = (0) x 8;

# set by get_posts() from each API response
my $blog_total_posts;

# disk limit in bytes; ~0 (the largest unsigned integer) stands for "no limit"
my $max_disk_space = $max_disk_space_mb
	? $max_disk_space_mb * 1024 * 1024
	: ~0;
	
# Main download loop: request the blog's photo posts one page ($step posts)
# at a time and save each acceptable image into $output_dir.
#
# The loop ends when any of these becomes true:
#   - the API returns an empty page (we have run off the end of the blog)
#   - every post the blog reports ($blog_total_posts) has been examined
#   - $max_images images have been saved
#   - more than $max_disk_space bytes have been downloaded
#   - $max_consecutive_badreqs posts in a row were skipped or failed
OUTER: while (1) {
	my $posts = get_posts($start, 1);

	# Without this check an empty page would leave nothing for the inner
	# loop to do, none of the exit tests would run, and we would keep
	# requesting ever-higher offsets forever.
	last OUTER unless $posts && @$posts;

	foreach my $row (@$posts) {
		last OUTER if $cs >= $max_consecutive_badreqs;
		$last_id = $row->{id};
		# $total counts every post examined, including the ones skipped
		# below, which is why it is incremented before any filtering
		++$total;

		my $url = $row->{$photo_url_1280};

		# at this point we have a url with a valid extension or go home
		if (!defined $url || $url !~ /(png|jpg|jpeg|gif)$/) {
			DIAG("BAD LINK: " . ($url // '(no photo url)'));
			++$cs; ++$badlink; next;
		}

		# In list context head() returns an empty list when the request
		# fails, so an undefined content type means the image could not
		# be reached (or the server sent no Content-Type at all).
		my ($content_type, $size) = head $url;
		if (!defined $content_type) {
			DIAG("BAD LINK: $url");
			++$cs; ++$badlink; next;
		}
		# the server may not send a Content-Length; treat that as unknown (0)
		$size //= 0;

		my $kbsize = int($size / 1024);
		if ($kbsize > $max_image_size_kb) {
			DIAG("TOO BIG: $url");
			++$cs; ++$toobig; next;
		}

		my $original_filename = (split '/', $url)[-1];
		# remove old extension, we don't trust it
		$original_filename =~ s/\.(png|jpg|jpeg|gif)$//;

		# Reduce e.g. "image/jpeg; charset=binary" to "image" and "jpeg".
		(my $mime = lc $content_type) =~ s/\s*;.*//s;
		my ($major, $subtype) = split "/", $mime;
		if (!defined $subtype or $major ne "image" or !$types{$subtype}) {
			DIAG("BAD LINK: $url");
			++$cs; ++$badlink; next;
		}

		# set filename
		my $filename = $filenames eq 'numbered' ? sprintf("%06d", $got)
			     : $filenames eq 'original' ? $original_filename
			     :                            $row->{id};
		# set correct extension based on file's MIME type
		$filename .= "." . $types{$subtype};

		DIAG("#$got $last_id\tdownloading $url to $filename");

		# getstore() returns the HTTP status code; only count and index
		# the image when the download really succeeded.
		my $path   = "$output_dir/$filename";
		my $status = getstore $url, $path;
		unless (is_success($status)) {
			DIAG("DOWNLOAD FAILED ($status): $url");
			++$cs; ++$badlink; next;
		}

		++$got;
		# prefer the real size on disk; fall back to the advertised size
		$downloaded += (-s $path) || $size;
		$cs = 0;
		print $idx "$last_id\t$filename\t$url\n";

		last OUTER if 	$got >= $max_images  ||
				$downloaded > $max_disk_space ||
				(defined $blog_total_posts && $total >= $blog_total_posts);
	}

	# Skipped posts leave the inner loop via "next" and so bypass the tests
	# above; repeat them here so we do not request another page needlessly.
	last OUTER if $cs >= $max_consecutive_badreqs;
	last OUTER if defined $blog_total_posts && $total >= $blog_total_posts;

	$start += $step;
}
# Final report: flush the index, explain an early abort (if there was one),
# then print the totals.

# Buffered write errors only surface when the handle is closed, so check it.
close $idx or warn "error closing $index: $!\n";

if ($downloaded > $max_disk_space) {
	# this branch is only reachable when a limit was configured
	print "ABORTED: disk space over your limit of ${max_disk_space_mb}mb\n";
} elsif ($cs >= $max_consecutive_badreqs) {
	print "ABORTED: $max_consecutive_badreqs failed requests in a row\n";
}
print "==============================================================\n";
print "Searched $total posts to retrieve $got images\n";
print "Skipped: $badlink bad links, $toobig too big\n";
print "$got images downloaded to $output_dir\n";

#===========================================================================
# Fetch one page of photo posts from the blog's JSON API.
#
# Arguments:
#   $start - offset of the first post to request (sent as "start=")
#   $test  - accepted so existing callers keep working; not used
#
# Returns the "posts" entry of the decoded response (the caller iterates it
# as an array reference). As a side effect, stores the response's
# "posts-total" entry in the file-level $blog_total_posts.
#
# Dies if the HTTP request fails or the payload is not valid JSON.
sub get_posts {
	my ($start, $test) = @_;

	my $url = "http://$blog.tumblr.com/api/read/json?start=$start&num=$step&type=photo";
	print "requesting $url\n===============================\n";
	my $str = get $url;

	# get() returns undef on failure; stop with a clear message instead of
	# a string of "uninitialized value" warnings and a JSON parse error.
	die "request failed: $url\n" unless defined $str;

	#======================================================================
	# Tumblr's "json" is really a javascript statement of the form
	#     var tumblr_api_read = {...};
	# so strip the assignment prefix and the trailing semicolon (plus any
	# trailing whitespace or line ending) to leave just the JSON object.
	# Removing the semicolon by pattern, rather than chopping the last
	# character, keeps the closing brace intact if the semicolon is absent.
	#
	# The field names also contain dashes (like "photo-url-1280"), which is
	# why the script keeps them in variables and uses those as hash keys.
	#======================================================================
	$str =~ s/\A\s*var\s+tumblr_api_read\s*=\s*//;
	$str =~ s/;\s*\z//;

	my $json = decode_json $str;
	$blog_total_posts = $json->{$posts_total};
	return $json->{posts};
}
# Print a single diagnostic message to STDOUT, followed by a newline.
sub DIAG {
	my ($message) = @_;
	print "$message\n";
}