#!/bin/sh
# Generate sitemap.xml for $DOMAIN by spidering it with wget.
#
# Pipeline: wget --spider crawls the site and logs every visited URL to
# $RAWLINKSFILE; the log is filtered into a sorted, de-duplicated,
# XML-safe URL list in $SANITISEDLINKSFILE; each URL is then wrapped in
# a sitemap <url> entry and written to $XMLFILE.
#
# Thanks to
# https://github.com/ttopholm/sitemap-generator/blob/master/sitemap_generator.sh
# for ideas

DOMAIN="pktsurf.in/smlinux/smlinux"
ADDDATE="$(date -u --iso-8601=seconds)"
RAWLINKSFILE="urls.txt"
SANITISEDLINKSFILE="cleanurls.txt"
XMLFILE="sitemap.xml"
CHANGEFREQUENCY='weekly'
PRIORITY='0.5'
EXCLUDEDIRS="reports"
EXCLUDEFILES="pkgresults"

# Spider the site without downloading bodies; wget writes one log line
# per visited URL (containing "URL:...") into $RAWLINKSFILE.
wget --wait=0.1 --spider --recursive --level=inf \
    --no-verbose --output-file="$RAWLINKSFILE" --exclude-directories="$EXCLUDEDIRS" \
    --reject "$EXCLUDEFILES" "$DOMAIN"

# Grep URL from the file, take the part after "URL:" with awk, trim
# surrounding whitespace, keep only the URL token, sort unique, drop
# empty lines, and escape literal '&' as '&amp;' so the URLs are valid
# inside XML.  (The original 's@&@&@g' was a no-op.)
grep -i URL "$RAWLINKSFILE" | awk -F 'URL:' '{print $2}' | awk '{$1=$1};1' |
    awk '{print $1}' | sort -u | sed '/^$/d' | sed 's@&@\&amp;@g' > "$SANITISEDLINKSFILE"

# Standard sitemap protocol preamble (https://www.sitemaps.org/protocol.html).
XMLHEADER0='<?xml version="1.0" encoding="UTF-8"?>'
XMLHEADER1='<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
printf '%s\n' "$XMLHEADER0" > "$XMLFILE"
printf '%s\n' "$XMLHEADER1" >> "$XMLFILE"

# Emit one <url> entry per crawled URL.  IFS= and -r keep the line
# byte-for-byte intact (no backslash interpretation, no trimming).
# Accept both http:// and https:// URLs; skip anything else.
while IFS= read -r p; do
  case "$p" in
    http://*|https://*)
      printf '<url><loc>%s</loc><lastmod>%s</lastmod><changefreq>%s</changefreq><priority>%s</priority></url>\n' \
          "$p" "$ADDDATE" "$CHANGEFREQUENCY" "$PRIORITY" >> "$XMLFILE"
      ;;
    *)
      ;;
  esac
done < "$SANITISEDLINKSFILE"

# Discard the intermediate files (left disabled for debugging, as before)
#rm -f "$RAWLINKSFILE" "$SANITISEDLINKSFILE"

# Finally close the xml file
echo '</urlset>' >> "$XMLFILE"