Third argument -- no. of parallel fetches
e.g. --
./fetch_id 200801 201301 4
#!/bin/bash
#
# Fetch script for the "Wizard of Id" comic archive.
#
# Fetching pattern -- one wget run per month page:
#   wget -H -e robots=off -p http://www.johnhartstudios.com/wizardofid/2012/12/
#   wget -H -e robots=off -p http://www.johnhartstudios.com/wizardofid/2012/11/
#   wget -H -e robots=off -p http://www.johnhartstudios.com/wizardofid/2011/10/
#
# Usage: ./fetch_id FROM TO PARALLEL
#   FROM      first month to fetch, format yyyymm
#   TO        last month to fetch (inclusive), format yyyymm
#   PARALLEL  no. of parallel fetches (must be >= 1)
#
# The output of each wget run is written to /tmp/<year><month>.log; the month
# in the log name is NOT zero-padded (e.g. /tmp/20081.log for January 2008).
#
# Parallelism is limited by counting this script's own background jobs, so
# unrelated wget processes do not influence the throttle.

set -u

readonly BASE_URL='http://www.johnhartstudios.com/wizardofid'
readonly MIN_YEAR=2007

# Range to fetch and parallelism limit; filled in by chk_arg.
declare -i year_start=0 month_start=0 year_end=0 month_end=0
# no. of fetches allowed to run at the same time
declare -i t=0

# Print a message to stderr and abort with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Validate the command line and fill in the range globals.
# Globals:   year_start, month_start, year_end, month_end, t (written)
# Arguments: $1 - from date (yyyymm), $2 - to date (yyyymm),
#            $3 - no. of parallel fetches
# Outputs:   error / warning messages on stderr
# Returns:   exits with status 1 on invalid arguments
#######################################
chk_arg() {
  local from=${1-} to=${2-} parallel=${3-}
  local bad_args='Wrong date start or date end arguments or missing arguments'

  # Check the textual shape first: integer variables silently turn an empty
  # or garbage string into 0, which would hide a missing argument.
  [[ $from =~ ^[0-9]{6}$ && $to =~ ^[0-9]{6}$ && $parallel =~ ^[0-9]+$ ]] \
    || die "$bad_args"

  # The 10# prefix forces base 10; without it "08" and "09" are rejected as
  # invalid octal numbers.
  year_start=$((10#${from:0:4}))
  month_start=$((10#${from:4:2}))
  year_end=$((10#${to:0:4}))
  month_end=$((10#${to:4:2}))
  t=$((10#$parallel))

  if (( t < 1 ||
        month_start < 1 || month_start > 12 ||
        month_end < 1 || month_end > 12 ||
        year_start < MIN_YEAR || year_end < MIN_YEAR ||
        year_start * 12 + month_start > year_end * 12 + month_end )); then
    die "$bad_args"
  fi

  # Advisory only: foreign wget processes are not counted by the throttle,
  # but they still compete for bandwidth.
  local other_wget
  other_wget=$(pgrep wget)
  if [[ -n $other_wget ]]; then
    printf 'Wget running in background, pid %s first quit that\n' \
      "${other_wget//$'\n'/ }" >&2
  fi
}

#######################################
# Fetch the archive page of a single month.
# Arguments: $1 - year (4 digits), $2 - month (1-12, no leading zero)
# Outputs:   progress line on stdout; wget output in /tmp/<year><month>.log
# Returns:   exit status of wget
#######################################
fetch() {
  local -i year=$1 month=$2
  local month_padded

  # The site uses two-digit months in its URLs.
  printf -v month_padded '%02d' "$month"

  echo "In progress, year $year and month $month"
  wget -H -e robots=off -p "${BASE_URL}/${year}/${month_padded}/" \
    &> "/tmp/${year}${month}.log"
}

#######################################
# Walk from the start month to the end month (inclusive), keeping at most
# $t fetches running, then wait for the remaining ones to finish.
# Globals:   year_start, month_start, year_end, month_end, t (read)
#######################################
manage() {
  local -i cur_year=$year_start cur_month=$month_start

  while (( cur_year < year_end ||
           (cur_year == year_end && cur_month <= month_end) )); do
    # Count our own running jobs instead of polling pgrep: a job is listed
    # as soon as it is started, so the limit cannot be overshot while wget
    # is still being spawned.
    while (( $(jobs -rp | wc -l) >= t )); do
      sleep 1
    done

    fetch "$cur_year" "$cur_month" &

    if (( cur_month < 12 )); then
      cur_month=$((cur_month + 1))
    else
      cur_month=1
      cur_year=$((cur_year + 1))
    fi
  done

  # Do not return while downloads are still in flight.
  wait
  echo "All fetches finished, wget output is in /tmp/<year><month>.log"
}

main() {
  chk_arg "$@"
  command -v wget > /dev/null || die 'wget not found in PATH'
  manage
}

main "$@"
No comments:
Post a Comment