Friday, July 22, 2011

Parallel re-fetcher (using wget)

This script is a parallel re-fetcher, i.e. it can restart downloads, remove the old files and replace them with freshly fetched copies. The header comment says it all.

#!/bin/bash
#
# Takes a file containing a list of links, plus a number defining how many
# links to fetch in parallel. Handing each wget its own list also lets wget
# reuse an existing TCP connection where possible.
#
# Usage: ./fetch_parallel <path to link list> <no. of parallel instances (n)> [t|c]
#
# The input file is split into n parts, stored in /tmp/fetching_links as text
# files named 1, 2, 3 ... n (leftover lines end up in a file named 0). Each
# part is fed to its own wget instance.
#
# The third argument is optional. If it is a number t, the script re-fetches
# the downloads every t seconds, overwriting files whose remote copy is newer.
# If it is 'c', it re-fetches continuously without waiting.
#
# Downloaded files are stored in ./downloads.

rm -Rf /tmp/fetching_links

# Read all links into one variable.
links="$(cat "$1")"

# each = the number of links every wget instance will handle.
declare -i each
each=$(( $(echo "$links" | wc -l) / $2 ))

# Helper index variable.
declare -i num
num=1

# Split the link list: slice $each lines off the top for every wget instance.
# Because $links starts out as a scalar, the trimmed remainder always lives
# in links[0], so any leftover lines (when the list does not divide evenly)
# end up there.
while (( num <= $2 ))
do
    links[$num]=$(echo "$links" | head -n "$each")
    # Trim the consumed lines so the next pass sees fresh ones.
    links="$(echo "$links" | tail -n +$(( each + 1 )))"
    num=num+1
done

# num is used later as the number of valid indexes in links,
# so bring it back down to the last index actually assigned.
num=num-1

mkdir -p /tmp/fetching_links

# If links[0] is empty, the list divided evenly and there are no leftovers.
# Mark that, so the loops below can skip index 0.
if [[ -z "${links[0]}" ]]
then
    f="empty"
fi

# num holds the number of available indexes; keep a copy in max for later.
declare -i max
max=$num

# Write each slice to its own file; file names match the array indexes.
while [[ $num != -1 ]]
do
    echo "${links[$num]}" > /tmp/fetching_links/$num
    num=num-1
    # Skip writing file 0 when there were no leftover links.
    if [[ $f == "empty" && $num == 0 ]]
    then
        break
    fi
done

unset links

# call_wget consumes max, so keep the master copy in num.
num=$max

# Called again and again for repetitive fetching.
call_wget(){
    # Reset max in case of a recall.
    max=$num
    while [[ $max != -1 ]]
    do
        wget -N --tries=3 --timeout=5 -P ./downloads \
             -U "Mozilla/5.0 (X11; U; Gentoo Linux x86_64; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/11.0.696.71 Safari/534.16" \
             -i /tmp/fetching_links/$max &> /dev/null &
        max=max-1
        # File 0 does not exist when f == "empty".
        if [[ $f == "empty" && $max == 0 ]]
        then
            break
        fi
    done
}

t=$3
# With no third argument, wget runs just once.
if [[ $t == "c" || $t -gt 0 ]]
then
    while true
    do
        call_wget
        echo "Fresh call"
        # Wait until no wget is left running in the background.
        while jobs | grep -q Running
        do
            echo "running"
            sleep 2
        done
        # Sleep, or start the next round right away, depending on $3.
        if [[ $t != "c" ]]
        then
            echo "waiting $t seconds"
            sleep "$t"
        fi
    done
else
    call_wget
    while jobs | grep -q Running
    do
        echo "running in background"
        sleep 2
    done
    echo "All done."
fi

rm -Rf /tmp/fetching_links
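A quick usage sketch (links.txt and the example URLs are made up for illustration):

    $ cat links.txt
    http://example.com/a.iso
    http://example.com/b.iso
    http://example.com/c.iso
    http://example.com/d.iso

    # Fetch once, two wget instances with two links each:
    $ ./fetch_parallel links.txt 2

    # Re-fetch every 600 seconds, overwriting stale files:
    $ ./fetch_parallel links.txt 2 600

    # Re-fetch back to back, without waiting:
    $ ./fetch_parallel links.txt 2 c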
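As a design note, GNU coreutils can do the same list splitting in one call; a minimal sketch of the idea, assuming split from coreutils 8.8 or newer (the script above does not use this):

    mkdir -p /tmp/fetching_links
    # Split the list into 2 line-based chunks named 00 and 01,
    # never breaking a line in half.
    split -d -n l/2 links.txt /tmp/fetching_links/

The manual head/tail loop stays in the script so that the leftover lines, when the list does not divide evenly, land in a predictable file: the one named 0.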