# Third argument -- no. of parallel fetches
# e.g. --
# ./fetch_id 200801 201301 4
#! /bin/bash
# Fetch wizard of ID script.
# fetching pattern --
# wget -H -e robots=off -p http://www.johnhartstudios.com/wizardofid/2012/12/
# wget -H -e robots=off -p http://www.johnhartstudios.com/wizardofid/2012/11/
# wget -H -e robots=off -p http://www.johnhartstudios.com/wizardofid/2012/10/
# wget -H -e robots=off -p http://www.johnhartstudios.com/wizardofid/2011/10/
# DO NOT run any other wget instance
# First argument -- start date, format yyyymm; second argument -- end date, same format
# Third argument -- no. of parallel fetches
# Prints the base-10 value of a string of digits, ignoring leading zeros.
# Anything that is not purely digits (including an empty string) prints 0,
# which the later argument check rejects.
# Needed because bash arithmetic treats a leading 0 as octal, so assigning
# "08" or "09" straight to an integer variable is an error.
# Arguments: $1 - digit string, e.g. "08" or "2012"
# Outputs:   the decimal value on stdout
to_decimal () {
  local digits=$1
  if [[ $digits =~ ^[0-9]+$ ]]; then
    printf '%d\n' "$((10#$digits))"
  else
    printf '0\n'
  fi
}
# Start and end of the range, split out of the yyyymm arguments:
# characters 0-3 are the year, characters 4-5 the month.
declare -i year_start=$(to_decimal "${1:0:4}")
declare -i month_start=$(to_decimal "${1:4:2}")
declare -i year_end=$(to_decimal "${2:0:4}")
declare -i month_end=$(to_decimal "${2:4:2}")
# vars For internal use
# Can be removed -- *_start serves the same purpose
declare -i cur_year=$year_start
declare -i cur_month=$month_start
# no. of threads to be run
declare -i t=$(to_decimal "$3")
# no. of fetch instances currently running
declare -i threads
chk_arg () {
  # Checks the arguments lightly and aborts the script when they are unusable.
  # Globals read: year_start, month_start, year_end, month_end, t
  # Exits with status 1 when:
  #   - a month is outside 1..12, a year is before 2007, or the number of
  #     parallel fetches is below 1 (missing or non-numeric values evaluate
  #     to 0 in arithmetic context, so they are caught by the same tests);
  #   - a wget process is already running.
  if (( month_start < 1 || month_start > 12 \
        || month_end < 1 || month_end > 12 \
        || year_start < 2007 || year_end < 2007 \
        || t < 1 )); then
    echo "Wrong date start or date end arguments or missing arguments" >&2
    exit 1
  fi
  # The message tells the user to stop the other wget first, so refuse to
  # continue instead of merely warning.
  local running
  running=$(pgrep wget)
  if [[ -n $running ]]; then
    echo "Wget running in background, pid $running first quit that" >&2
    exit 1
  fi
}
fetch () {
  # Downloads the archive page for the month held in cur_year / cur_month.
  # Globals read: cur_year, cur_month
  # Outputs: a progress line on stdout; everything wget prints goes to
  #          /tmp/<cur_year><cur_month>.log (month not zero-padded there).
  # Returns: wget's exit status.
  local month_dir=$cur_month
  # Months below 10 get a leading 0 in the URL path.
  if (( cur_month < 10 )); then
    month_dir=0$cur_month
  fi
  local page_url=http://www.johnhartstudios.com/wizardofid/$cur_year/$month_dir/
  local log_file=/tmp/$cur_year$cur_month.log
  printf '%s\n' "In progress, year $cur_year and month $cur_month"
  wget -H -e robots=off -p "$page_url" &> "$log_file"
}
manage () {
  # Manages calls to fetch(): starts one background fetch per month from
  # cur_year/cur_month up to and including year_end/month_end, keeping at
  # most t of them running at a time.
  # Globals read:    year_end, month_end, t
  # Globals written: cur_year, cur_month (advanced month by month), threads
  # Exits the script once every month has been started; the last fetches
  # may still be running in the background at that point.
  if (( t < 1 )); then
    # With no fetch slot available the loop below would sleep forever.
    echo "Number of parallel fetches must be at least 1" >&2
    exit 1
  fi
  # Compare (year, month) pairs as a single month count.
  local -i last_month=$(( year_end * 12 + month_end ))
  while (( cur_year * 12 + cur_month <= last_month )); do
    # Count this script's own running background jobs. Polling pgrep here
    # would miss a fetch that was just started but has not exec'd wget yet,
    # letting more than t fetches start.
    threads=$(( $(jobs -rp | wc -l) ))
    if (( threads < t )); then
      fetch &
      # increment cur_year and/or cur_month
      if (( cur_month < 12 )); then
        cur_month=$(( cur_month + 1 ))
      else
        cur_month=1
        cur_year=$(( cur_year + 1 ))
      fi
    else
      sleep 1
    fi
  done
  echo "Wget is probably still running in background. Check running processes and wait till all wget instances finish"
  exit 0
}
# Entry point: validate the arguments, then start the fetches.
main () {
  chk_arg
  manage
}
main "$@"
# No comments:
# Post a Comment