#!/bin/bash
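# Clean up any stale split lists left behind by a previous run.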
rm -Rf /tmp/fetching_links
# Takes a file of links and a number n defining how many wget instances
# will fetch in parallel. Each instance also reuses its TCP connection
# across the links it is given.
# e.g. ./fetch_parallel <path to list> <no. of instances (n)>
# The input file is split into n parts, stored in /tmp/fetching_links
# (as text files named 0, 1, 2, 3 etc., depending on the value of n).
# Each file is fed to one wget instance for fetching.
# The third argument is optional. If present, the downloads are refetched
# after that many seconds, overwriting the existing files. If it is 'c',
# the script refetches without waiting.
# Files will be stored in ./downloads
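# Example invocations (links.txt is a hypothetical file, one URL per line):
#   ./fetch_parallel links.txt 4 600   # 4 instances, refetch every 600 s
#   ./fetch_parallel links.txt 4 c     # 4 instances, refetch immediately
#   ./fetch_parallel links.txt 4       # 4 instances, fetch once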
# variable to store all links
links="$(cat "$1")"
# Generate the per-instance link lists (depending on the no. of parallel fetches).
# Variable each holds the no. of lines each wget instance will process.
declare -i each
each=$(($(echo "$links" | wc --lines)/$2))
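# Worked example (hypothetical numbers): a 10-line list with n=3 gives
# each = 10/3 = 3; the one leftover line is handled via links[0] below.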
# Helper index variable $num
declare -i num
num=1
# Assign each wget instance its link list by splitting the input into
# slices of the array links.
while [[ $((num - 1)) != $2 ]]
do
links[$num]=$(echo "$links" | head -n $each)
# Trim the consumed lines so the next iteration reads the next slice.
# Remaining lines (when the total is not divisible by each) stay in links[0].
links="$(echo "$links" | tail -n +$((each+1)))"
num=num+1
done
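# Continuing the example: links[1..3] now hold 3 links each, and links[0]
# holds the single leftover link.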
# num will later be used as the no. of valid indexes in links, so reduce
# its value to the right one.
num=num-1
mkdir /tmp/fetching_links
# The entries of the array links will now be written to files in
# /tmp/fetching_links, each read separately by one wget instance.
# If links[0] is "" or a bare newline, there were no leftovers after the
# split of the link list; mark this for later.
if [[ ${links[0]} == "" || ${links[0]} == $'\n' ]]
then
# Set variable f to "empty" if links[0] was blank. For future use.
f="empty"
fi
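# e.g. 9 links with n=3 divide evenly, so f="empty"; 10 links with n=3
# leave one link in links[0] and f stays unset.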
# num variable contains the no. of indexes available. For future use, it's copied over to another variable max.
declare -i max
max=$num
while [[ $num != -1 ]]
do
# File names are the same as the index into the array links.
echo "${links[$num]}" > /tmp/fetching_links/$num
num=num-1
# Skip file 0 when there were no leftover links to write.
if [[ $f == "empty" ]]
then
if [[ $num == 0 ]]
then
break
fi
fi
done
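# /tmp/fetching_links now holds files 1..n (plus file 0 when there were
# leftovers), one link list per wget instance.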
# The fetch is wrapped in a function so it can be called again and again
# for repetitive fetching.
unset links
# The value of max will be needed on every call, so it's kept in num.
declare -i num
num=$max
call_wget(){
# Resetting value of max in case of a recall.
max=$num
# Calling of wget starts here, using the previous variables max and 'f'.
while [[ $max != -1 ]]
do
# -N refetches a file only when the remote copy is newer; -i reads the
# URL list; -P puts the files in ./downloads, as documented above.
wget -N -P downloads --tries=3 --timeout=5 -U "Mozilla/5.0 (X11; U; Gentoo Linux x86_64; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/11.0.696.71 Safari/534.16" -i /tmp/fetching_links/$max &> /dev/null &
max=max-1
# File 0 will be missing if f == "empty", so stop before reaching it.
if [[ $f == "empty" ]]
then
if [[ $max == 0 ]]
then
break
fi
fi
done
}
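# Each wget runs in the background (&), so all list files are fetched in
# parallel; the jobs built-in is polled below to wait for them to finish.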
t=$3
# If $3 is empty, wget will just run once; a number or 'c' loops forever.
if [[ -n "$t" ]]
then
while [[ "1" != "2" ]] # An infinite loop
do
call_wget
echo "Fresh call"
# See if any wgets are still running in the background, using jobs.
while jobs | grep -q Running
do
sleep 2
echo "running"
done
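# (A plain 'wait' would also block until the background jobs finish,
# but polling lets the script print a heartbeat every 2 seconds.)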
# The sleep timer, or no wait at all, depending on the value of $3.
if [[ $t != "c" ]]
then
echo "waiting $t seconds"
sleep $t
fi
done
else
call_wget
status="running"
while [[ "$status" == "running" ]]
do
jobs | grep Running &> /dev/null
if [[ $? == 0 ]]
then
status="running"
sleep 2
echo "running in background"
else
status=""
echo "All done."
fi
done
fi
rm -Rf /tmp/fetching_links
Friday, July 22, 2011
Parallel re-fetcher (using wget).
This script is a parallel re-fetcher (i.e. it can restart downloads, removing the old files and updating them with new ones). The comments say it all.
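A minimal run might look like this (assuming the script above is saved as fetch_parallel and links.txt is your own list, one URL per line):

    chmod +x fetch_parallel
    ./fetch_parallel links.txt 4 600

That starts 4 parallel wget instances and refetches the whole list every 600 seconds; pass 'c' instead of 600 to refetch immediately, or drop the third argument to fetch just once.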