#!/usr/bin/env zsh
# =================================================
# CREATED: qua set 21 13:40:53 BRT 2016
# Last Change: sex 23 set 2016 09:09:03 BRT
# THIS SCRIPT AIMS: get mairo's vergara flashcards
# AUTHOR: Sérgio Luiz Araújo Silva
# SITE: http://vivaotux.blogspot.com
# TWITTER: @voyeg3r
# SKYPE: sergioaraujosilva
# =================================================
#
# OBS:
# ========== Não esqueça do wgetrc ===
#
# ### Sample Wget initialization file .wgetrc by http://www.askapache.com
# ## Local settings (for a user to set in his $HOME/.wgetrc). It is
# ## *highly* undesirable to put these settings in the global file, since
# ## they are potentially dangerous to "normal" users.
# ##
# ## Even when setting up your own ~/.wgetrc, you should know what you
# ## are doing before doing so.
# header = Accept-Language: en-us,en;q=0.5
# header = Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
# header = Connection: keep-alive
# user_agent = Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2
# referer = http://www.askapache.com/
# robots = off
#
# Dependences to run this script:
# zsh perl-rename wget sed grep awk
setopt nonomatch
URL=$1
echo "$URL" > lesson-url.txt
# OBS: Maybe the biggest problem I came across was this stupid
# unseprable char: \xc2\xa --> see more on this post:
# http://askubuntu.com/questions/357248/how-to-remove-special-m-bm-character-with-sed
# URL="http://www.mairovergara.com/come-handy-o-que-significa-esta-expressao/"
# download dos textos
# wget -O - -o /dev/null http://www.mairovergara.com/come-handy-o-que-significa-esta-expressao/ | grep strong | sed 's/<[^>]*>//g' | sed 's/([^)]*)//g' | sed '/Download da Lição/,$d' | sed '/^$/d' | sed '1~2p' > text.txt
# wget -O - -o /dev/null "$URL" | grep strong | sed 's/<[^>]*>//g' | sed 's/([^)]*)//g' | sed '/Download da Lição/,$d' | sed '/^$/d' | sed '1~2s,.*,&\n[sound:&mp3];,g' | sed '2~2s, ,-,g' | awk '{print}; NR%3==0 {print ""}' | awk 'BEGIN {RS=""}; {$1=$1;print}' | sed 's,\.\s\+,.,g' > deck.csv
# old url: "http://www.mairovergara.com/rip-off-phrasal-verb-significado/"
wget -O - -o /dev/null "$URL" | grep strong | sed 's/<[^>]*>//g' | sed 's/([^)]*)//g' | sed -re '/Download da Lição/,$d' | sed '/Abaixo temos exemplos/d' | sed '/no sentido/d' | sed '/com o sentido/d' | sed '/exemplos abaixo/d' | sed '/^[0-9][^:]*:/d' | sed 's,“,,g' | sed "s/’/'/g" | sed 's,”,",g' | sed 's/\xc2\xa0/ /g' > deck.csv
cat deck.csv > tempdeck.csv
# cat tempdeck.csv | sed '/^$/d' | sed '1~2s,.*,&[sound:&mp3];,g' | awk '{print}; NR%2==0 {print ""}' | awk 'BEGIN {RS=""}; {$1=$1;print}'
# preciso de um awk para remover espaços de substrings
cat deck.csv | sed '/^$/d' | sed '/http.*/d' | sed '1~2s/.*/&\n[sound:&mp3];/g' | sed '2~3s/ /-/g' | sed '2~3s/,//g' | sed '2~3s/\!\(\mp3\)/.\1/g'| sed '2~3s/\?mp3/.mp3/g' | awk '{print}; NR%3==0 {print ""}' | awk 'BEGIN {RS=""}; {$1=$1;print}' | sed 's/\.\s\+/./g' | sed 's/\! /!/g' > deck.csv
# cat deck.csv | sed -i "s/\&\#8217;/'/g" deck.csv > tempdeck.csv
# wget -O - -o /dev/null http://www.mairovergara.com/come-handy-o-que-significa-esta-expressao/ | grep strong | sed 's/<[^>]*>//g' | sed 's/([^)]*)//g' | sed '/Download da Lição/,$d' | sed '/^$/d' | sed '1~2s,.*,&\n[sound:&mp3];,g' | awk '{print}; NR%3==0 {print ""}' | awk 'BEGIN {RS=""}; {$1=$1;print}' | sed 's,\.\s\+,.,g' > deck.csv
# cat text.txt | sed '1~2s,.*,&[sound:&mp3],g' > text.txt
# download dos audios
# wget -O - -o /dev/null http://www.mairovergara.com/come-handy-o-que-significa-esta-expressao/ | grep -o 'http://[^ >]*\.mp3' > links.txt
wget -O - -o /dev/null "$URL" | grep -o 'http://[^ >]*\.mp3' | sort -u > links-dos-audios.txt
# wget -O - -o /dev/null http://www.mairovergara.com/come-handy-o-que-significa-esta-expressao/ | grep -o 'http://[^ >]*\.mp3?_=1[^.<]' > links.txt
# donwload dos audios
echo "criando diretório mp3..."
mkdir mp3
cd mp3
wget -i ../links-dos-audios.txt
echo "Diretório atual: $PWD"
perl-rename 's/^\d+-//g' *
# for i in *.mp3.*; do
# rm -f $i
# done
# remover dígitos no começo dos arquivos mp3
# copiar audios para a pasta do anki
for i in *.mp3; cp $i ~/docs/Anki/sergio/collection.media/
cd ..
# documentation
# wget -O - -o /dev/null ............ baixa a página para o terminal sem fazer download
# grep strong ....................... pega as frases
# sed 's/<[^>]*>//g' ................ remove tags html
# sed 's/([^)]*)//g' ................ remove observações
# sed '/Download da Lição/,$d' ...... apaga até o final da página
# sed '/^$/d' ....................... apaga linhas vazias
# sed '1~2s/ /-/g' .................. troca espaços por traços nas linhas impares
A little script to get some flashcards from mairovergara.com.br
Be the first to comment
You can use [html][/html], [css][/css], [php][/php] and more to embed the code. Urls are automatically hyperlinked. Line breaks and paragraphs are automatically generated.