Tuesday, December 25, 2012

Wizard of ID fetch script.

First argument -- start date, format yyyymm; second argument -- end date, format yyyymm.
Third argument -- number of parallel fetches.
e.g. --
./fetch_id 200801 201301 4


#! /bin/bash
# Fetch wizard of ID script.
# fetching pattern --
# wget -H -e robots=off -p http://www.johnhartstudios.com/wizardofid/2012/12/
# wget -H -e robots=off -p http://www.johnhartstudios.com/wizardofid/2012/11/
# wget -H -e robots=off -p http://www.johnhartstudios.com/wizardofid/2012/10/
# wget -H -e robots=off -p http://www.johnhartstudios.com/wizardofid/2011/10/
# DO NOT run any other wget instance
# First argument -- start date, format yyyymm; second argument -- end date, same format
# Third argument -- no. of parallel fetches

# Prints $1 with its leading zeros removed ("08" -> "8"; "" or "00" -> "0").
# Needed because declare -i evaluates the assigned value as arithmetic, where a
# leading 0 marks an octal constant, so "08" and "09" would be rejected.
strip_leading_zeros () {
 local digits=$1
 # ${digits%%[!0]*} is the run of zeros at the front; strip exactly that prefix
 digits=${digits#"${digits%%[!0]*}"}
 echo "${digits:-0}"
}
declare -i year_start=$(strip_leading_zeros "${1:0:4}")
declare -i month_start=$(strip_leading_zeros "${1:4:2}")
declare -i year_end=$(strip_leading_zeros "${2:0:4}")
declare -i month_end=$(strip_leading_zeros "${2:4:2}")
# vars For internal use
# Can be removed -- *_start serves the same purpose
declare -i cur_year=$year_start
declare -i cur_month=$month_start
# no. of threads to be run
declare -i t=$(strip_leading_zeros "$3")
# no. of fetch instances currently running
declare -i threads
chk_arg () {
#  Checks the arguments lightly.
#  Globals read: year_start, month_start, year_end, month_end, t.
#  Exits with status 1 when a month is outside 1..12, a year is before 2007,
#  or the number of parallel fetches is below 1. Missing arguments arrive here
#  as 0 (or unset, which arithmetic also treats as 0) and are caught by the
#  same range checks; a thread count of 0 must be rejected because manage()
#  would otherwise wait forever for a free slot.
#  Only warns (does not exit) when some wget process is already running.
 local running_wget
 if (( month_start < 1 || month_start > 12 || month_end < 1 || month_end > 12 || year_start < 2007 || year_end < 2007 || t < 1 ))
 then
  echo "Wrong date start or date end arguments or missing arguments" >&2
  exit 1
 fi
 #  Query pgrep once so the decision and the reported pids cannot disagree
 running_wget=$(pgrep wget)
 if [[ -n $running_wget ]]
 then
  echo "Wget running in background, pid $running_wget first quit that" >&2
 fi
}
fetch () {
#  Downloads the archive page for the month held in cur_year / cur_month.
#  The URL carries the month zero-padded to two digits; the wget output is
#  written to /tmp/<cur_year><cur_month>.log using the unpadded month.
 local month_in_url=$cur_month
 echo "In progress, year $cur_year and month $cur_month"
 if [[ $cur_month -lt 10 ]]
 then
  month_in_url=0$cur_month
 fi
 wget -H -e robots=off -p "http://www.johnhartstudios.com/wizardofid/$cur_year/$month_in_url/" &> "/tmp/$cur_year$cur_month.log"
}
manage () {
#  Manages calls to fetch().
#  Walks month by month from cur_year/cur_month up to and including
#  year_end/month_end, starting one background fetch per month and keeping at
#  most $t of them running at the same time.
#  Globals read: year_end, month_end, t. Globals written: cur_year, cur_month, threads.
#  Once every month has been dispatched it prints a reminder and exits the
#  script; the last fetches may still be running at that point.
 local -a pids=() alive=()
 local pid
 while [[ $cur_year -lt $year_end || ( $cur_year -eq $year_end && $cur_month -le $month_end ) ]]
 do
  #  Count the fetches started here that are still alive. Tracking our own
  #  pids (instead of counting wget processes) avoids launching extra fetches
  #  in the gap between "fetch &" and wget actually showing up in the process
  #  list, and ignores wget processes that belong to someone else.
  alive=()
  for pid in "${pids[@]}"
  do
   #  signal 0 only tests that the process still exists
   if kill -0 "$pid" 2> /dev/null
   then
    alive+=("$pid")
   fi
  done
  pids=("${alive[@]}")
  threads=${#pids[@]}
  if [[ $threads -lt $t ]]
  then
   fetch &
   pids+=("$!")
   #  advance to the next month; explicit arithmetic so this does not depend
   #  on the variables carrying the integer attribute
   if [[ $cur_month -lt 12 ]]
   then
    cur_month=$((cur_month + 1))
   else
    cur_month=1
    cur_year=$((cur_year + 1))
   fi
  else
   #  all slots busy -- poll again shortly
   sleep 1
  fi
 done
 echo "Wget is probably still running in background. Check running processes and wait till all wget instances finish"
 exit
}
# Entry point: validate the arguments first (chk_arg exits on bad input), then dispatch the fetches.
chk_arg
manage