- Home
- Contact
-
Articles / Code
- HTML/CSS (2)
-
Scripts (17)
- Twitter & bash
- Generate sitemaps
- random floating point in bash
- Gmail script
- Python http POST requests
- Bashrc enhancements
- commandline <> nautilus
- Word Definitions
- Synonyms
- mysqld monitor
- remote server
- links and emailaddresses
- Apache Analyser
- remote ipaddress
- OOP Python
- mysql tuning
- diskspace notification
- Server Configuration (6)
- ICT-security (2)
Retrieve links and email addresses from a webpage
» Articles / Code » Scripts » Retrieve links and email addresses from a webpage
This script started as a Bash coding exercise but turned out to be a very useful way of getting data from websites, and it contains many useful little coding tricks.
#!/bin/sh
##############################################
# Script metadata. NAME_, VERSION_, PURPOSE_, SYNOPSIS_ and OPTIONS_ are the
# values shown in the help text.
NAME_="webextract"
PURPOSE_="extract links and/or email-addresses from a webpage"
# -e and -l each take the URL to fetch as their argument; -h takes none.
SYNOPSIS_="$NAME_ [-e url] [-l url] [-h]"
OPTIONS_="
-e extract all emailaddresses from a url
-l extract all links from url
-h show help (this)"
REQUIRES_="Curl"
VERSION_="0.9"
#Created by Lx
#licence : GPL3
#################################################
# Print the help text (name, version, purpose, synopsis and options) to
# stderr, then terminate the script with exit status 1.
# Globals read: NAME_, VERSION_, PURPOSE_, SYNOPSIS_, OPTIONS_
usage () {
  # printf interprets the \033 escapes of the format string in every shell,
  # whereas echo only does so in some (bash's echo prints them literally
  # unless given -e). The variables are passed as %s arguments so that
  # their contents are always printed verbatim.
  printf '\n\033[1m%s %s \033[0m - %s\nUsage: %s\nOptions: %s\n\n' \
    "$NAME_" "$VERSION_" "$PURPOSE_" "$SYNOPSIS_" "$OPTIONS_" >&2
  exit 1
}
# tmp file set up
# mktemp creates the file atomically under an unpredictable name. The former
# /tmp/tmp.${RANDOM}$$ name was guessable, and $RANDOM does not exist in
# plain POSIX sh.
tmp_1=$(mktemp "${TMPDIR:-/tmp}/webextract.XXXXXX") || {
  echo "cannot create temporary file" >&2
  exit 1
}
# signal trapping and tmp file removal
# Trap 0 fires on every exit path; the quotes keep the path a single word.
trap 'rm -f -- "$tmp_1" >/dev/null 2>&1' 0
# Turn HUP, INT, QUIT and TERM into a normal exit so the trap above runs.
trap "exit 1" 1 2 3 15
# Parse the given command-line options and run the requested extraction.
#   -e URL  fetch URL with curl, split the page into tokens and print every
#           token containing "@" (with any "mailto:" prefix removed)
#   -l URL  fetch URL with curl, split the page into tokens and print every
#           token containing "http:" or "https:" (with "href=" / "src="
#           removed)
#   -h      show the help text by calling usage
# Globals:  tmp_1 - path of the scratch file the filtered output is staged in
# Outputs:  matches on stdout, one per line; diagnostics on stderr
# Returns:  the final value of OPTIND. This is the function's historical
#           contract and is kept for compatibility; note that it is never 0.
extract () {
  # Start parsing from the first argument even if getopts was used before.
  OPTIND=1
  # The leading ":" selects silent error reporting, so missing arguments
  # arrive as ":" and unknown options as "?". -h takes no argument.
  while getopts ":e:l:h" optname
  do
    case "$optname" in
      e)
        # Quoting keeps a URL with spaces or glob characters as one word.
        curl -s -S "$OPTARG" | {
          # tr turns every delimiter into a newline: one token per line.
          tr ',;<>()"\47 ' '[\n*]' | sed -n -e 's/mailto://gI' -e '/@/p' > "$tmp_1"
          cat "$tmp_1"
        }
        ;;
      l)
        curl -s -S "$OPTARG" | {
          # https\{0,1\} matches both http: and https: links.
          tr '<>"\47 ' '[\n*]' | sed -n -e 's/href=//gI' -e 's/src=//gI' -e '/https\{0,1\}:/Ip' > "$tmp_1"
          cat "$tmp_1"
        }
        ;;
      h)
        usage
        ;;
      \?)
        echo " Unknown option $OPTARG" >&2
        usage
        ;;
      :)
        echo " Option -$OPTARG requires an argument" >&2
        usage
        ;;
      *)
        # should not occur
        echo "unknown error" >&2
        ;;
    esac
  done
  return $OPTIND
}
# Debug helper: echo every argument on a line of its own, wrapped in square
# brackets, so that the boundaries between arguments are visible.
showargs () {
  # Consume the positional parameters one at a time.
  while [ "$#" -gt 0 ]
  do
    p=$1
    shift
    echo "[$p]"
  done
}
###
# Entry point: with at least one argument hand the command line to extract,
# otherwise show the help text.
if [ "$#" -ne 0 ]; then
  # "$@" passes every argument on unchanged; a bare $@ would re-split URLs
  # containing spaces and expand glob characters such as ? or *.
  extract "$@"
else
  usage
fi
Post your comment
Comments
No one has commented on this page yet.
RSS feed for comments on this page | RSS feed for all comments