-- Search news pages
-- Public Domain - written by Rob Craig, February 2008
-- This demo reads numerous Web pages in parallel
-- and reports on the number of occurrences of a word or phrase.
-- Each page is handled by a separate Euphoria task running
-- in parallel with several other tasks. Pages that contain
-- matches are displayed using your default web browser.
-- The search words are shown in red with italics in a very large font.
-- This demo uses Euphoria's new multitasking feature.
-- It creates multiple wget background processes, each retrieving one Web page.
-- You can get a version of wget for Windows from:
--
-- http://www.gnu.org/software/wget/wget.html
-- A Euphoria task is assigned to each instance of wget, searching the
-- Web page text as it arrives. In this way, when a task is blocked due
-- to a delayed response from a particular server, the program can easily
-- switch to another task that is not blocked. The program quits after a
-- period of 10-15 seconds with no progress made on any page.
-- News Sources - Add your favorite sites here...
-- Each entry is a host plus optional path, without the "http://" scheme.
-- It is good to have "/" at the end of a top level domain.
-- Order matters: on LINUX only the first nine entries are kept.
sequence URLs
URLs = {
    "www.cbc.ca/news/",
    "www.juancole.com/",
    "www.abc.net.au/",
    "abcnews.go.com/",
    "english.aljazeera.net/HomePage",
    "news.bbc.co.uk/",
    "www.cbsnews.com/",
    "cnn.com/",
    "www.democracynow.org/index.pl",
    "www.foxnews.com/",
    "www.guardian.co.uk/",
    "www.msnbc.msn.com/",
    "www.reuters.com/",
    "www.whatreallyhappened.com/",
    "news.yahoo.com/"
}
include wildcard.e
include graphics.e
include dll.e
include machine.e
-- ShellExecuteA from shell32.dll is used to open the generated HTML files
-- with the default browser.
-- NOTE(review): this linkage is attempted on every platform, so where
-- shell32.dll cannot be opened the program stops here, before the
-- platform-specific setup further down is reached - confirm this is intended.
constant SW_SHOWNORMAL = 1  -- last argument passed to ShellExecute

atom shell32            -- handle returned by open_dll()
integer ShellExecute    -- routine id returned by define_c_proc()

shell32 = open_dll("shell32.dll")
if shell32 = NULL then
    puts(1, "Couldn't find shell32.dll\n")
    abort(1)
end if

-- all six arguments are passed as C_LONG values
ShellExecute = define_c_proc(shell32, "ShellExecuteA", repeat(C_LONG, 6))
if ShellExecute = -1 then
    puts(1, "Couldn't link to ShellExecuteA\n")
    abort(1)
end if
bk_color(0)

-- Obtain the search phrase: the first user argument on the command line
-- if there is one, otherwise prompt for it on standard input.
sequence cl
object search_phrase

cl = command_line()
if length(cl) < 3 then
    puts(1, "Enter search word or phrase:\n")
    search_phrase = gets(0)
    -- give up on end-of-file or on an (effectively) empty answer
    if atom(search_phrase) then
        abort(1)
    end if
    if length(search_phrase) < 2 then
        abort(1)
    end if
    -- drop the trailing newline left by gets()
    if search_phrase[$] = '\n' then
        search_phrase = search_phrase[1..$-1]
    end if
else
    search_phrase = cl[3]
end if

-- Platform-dependent shell details: where to discard command output
-- and which command deletes files.
sequence null_device, del_cmd

null_device = "NUL"
del_cmd = "del"
if platform() = LINUX then
    URLs = URLs[1..9] -- less room on screen
    null_device = "/dev/null"
    del_cmd = "rm"
end if
-- HTML wrapped around every occurrence of the search phrase in the copy of
-- a page that is shown in the browser: red, italic, enlarged text.
-- (With both strings empty, matches are not highlighted at all.)
constant PREFIX = "<font color=\"red\" size=+2><i>",
         POSTFIX = "</i></font>"
function set_base_href(sequence fname, sequence url, sequence search_string)
-- Write a modified copy of the downloaded page fname to "mod" & fname.
-- The copy starts with a <base href> tag pointing at url, so that relative
-- links in the page still resolve when the local copy is opened, and every
-- occurrence of search_string is wrapped in PREFIX/POSTFIX.
--
-- fname:         name of the downloaded HTML file
-- url:           address of the page, without the "http://" scheme
-- search_string: phrase to highlight; it is compared against upper(line),
--                so it must already be in upper case. The matched text is
--                replaced by search_string itself.
--
-- Returns the name of the new file, or "" if either file can't be opened.
    sequence newname
    integer new, old, m, start, step
    object line

    newname = "mod" & fname
    new = open(newname, "w")
    old = open(fname, "r")
    if new = -1 or old = -1 then
        -- don't leak the handle that did open
        if new != -1 then
            close(new)
        end if
        if old != -1 then
            close(old)
        end if
        puts(2, "couldn't open file!\n")
        return ""
    end if

    -- some pages need this:
    puts(new, "<base href=\"http://" & url & "\">\n")

    -- distance from the start of a match to just past the inserted POSTFIX
    step = length(PREFIX) + length(search_string) + length(POSTFIX)

    -- copy file, highlighting matches
    while 1 do
        line = gets(old)
        if atom(line) then
            exit    -- end of file
        end if
        start = 1
        -- an empty search string can't be matched: copy the line unchanged
        while length(search_string) > 0 do
            -- line changes on every pass, so it is re-uppercased each time
            m = match_from(search_string, upper(line), start)
            if m = 0 then
                exit
            end if
            line = line[1..m-1] &
                   PREFIX &
                   search_string &
                   POSTFIX &
                   line[m+length(search_string)..$]
            -- resume after the inserted text so it is never rescanned
            start = m + step
        end while
        puts(new, line)
    end while

    close(old)
    close(new)
    return newname
end function
-- Flags shared between the main loop and the search tasks:
--   progress - set to 1 by a task after it processes a line of a page;
--              cleared by the main loop before each yield
--   quit     - set to 1 by the main loop to tell the tasks to report
--              their final results and terminate
integer progress, quit
procedure task_search_url(sequence url, sequence string)
-- Task body: download a Web page and search it for a string.
--
-- url:    page address without the "http://" scheme
-- string: word or phrase to look for (compared case-insensitively)
--
-- A background wget process writes the page to a per-task temp file, which
-- is read line by line as it grows. The status line for this task is kept
-- up to date on screen row task_self()+1. The task returns only once the
-- global quit flag is set; if any line matched by then, a highlighted copy
-- of the page is opened with ShellExecute.
    integer f, hits
    integer line_count
    object line
    sequence mytemp, modtemp, ustring
    atom html_file, open_str

    text_color(YELLOW)
    position(task_self()+1, 1)
    printf(1, "task %2.0f: %-35s waiting for wget...", {task_self(), url})

    ustring = upper(string)
    hits = 0

    -- run a copy of wget as a background process
    mytemp = sprintf("newstemp%.0f.html", task_self())
    system(sprintf("wget -q -b -O %s %s > %s", {mytemp, "http://" & url, null_device}), 2)

    f = -1
    while f = -1 do
        -- wait until file exists
        if quit then
            return
        end if
        task_schedule(task_self(), {1.0, 2.0})
        task_yield()
        f = open(mytemp, "rb")
    end while

    text_color(BRIGHT_WHITE)
    position(task_self()+1, 1)
    printf(1, "task %2.0f: %-35s waiting for data...", {task_self(), url})
    text_color(WHITE)

    line_count = 0
    -- this loop is left only through the return in the quit branch below
    while 1 do
        line = gets(f)
        if atom(line) then
            -- could be actual end-of-file, or maybe there's more coming
            task_schedule(task_self(), {1.0, 1.5})
            while 1 do
                line = gets(f)
                if sequence(line) then
                    exit -- more data came in
                end if
                if quit then
                    -- we've been told to quit
                    close(f)
                    if hits then
                        -- display this page, but first insert base href
                        modtemp = set_base_href(mytemp, url, ustring)
                        -- "" means the copy could not be written:
                        -- there is nothing to open in that case
                        if length(modtemp) then
                            html_file = allocate_string(modtemp)
                            open_str = allocate_string("open")
                            c_proc(ShellExecute, {0, open_str, html_file, 0, 0, SW_SHOWNORMAL})
                            -- the C strings are only needed during the call
                            free(open_str)
                            free(html_file)
                        end if
                    end if
                    return
                end if
                task_yield()
            end while
        end if

        if match(ustring, upper(line)) then
            hits += 1
        end if
        line_count += 1

        -- green once anything has matched, blue until then
        if hits then
            text_color(BRIGHT_GREEN)
        else
            text_color(BRIGHT_BLUE)
        end if
        position(task_self()+1, 1)
        printf(1, "task %2.0f: %-35s matched %d lines out of %d ", {task_self(), url, hits, line_count})
        text_color(WHITE)

        progress = 1    -- tell the main loop that something is still happening

        -- this yield is not necessary, but it
        -- lets you see the parallelism better
        task_schedule(task_self(), 1)
        task_yield()
    end while
end procedure
-- Create one search task per news source. The tasks do not start running
-- until the main task yields for the first time in the loop below.
integer t
for i = 1 to length(URLs) do
    t = task_create(routine_id("task_search_url"), {URLs[i], search_phrase})
    task_schedule(t, 1)
end for

-- remove any downloaded pages that are still lying around
system(del_cmd & " newstemp*.html > " & null_device, 2)

clear_screen()
puts(1, "Looking for \"" & search_phrase & "\"\n")

atom time_out
time_out = time() + 45

task_schedule(0, {2.5, 3.0}) -- check the time every 2.5 to 3.0 seconds

quit = 0
while 1 do
    progress = 0
    task_yield()
    if not progress then
        -- nothing was read since the last check: show the countdown
        -- and stop once the deadline has passed
        position(length(URLs)+3, 1)
        printf(1, "time remaining: %d seconds ", time_out - time())
        if time() > time_out then
            exit
        end if
    else
        -- quit 10 seconds after no more lines are read
        -- from any file by any task
        time_out = time() + 10
    end if
end while

quit = 1 -- signal all tasks to report any final results and terminate

-- keep yielding until only this main task is left
while length(task_list()) > 1 do
    task_yield()
end while

position(length(URLs)+4, 1)
puts(1, "Press Enter to quit ...\n")
-- wait for a key; the value read is not used
if getc(0) then
end if

-- delete the downloaded pages, their highlighted copies and the wget logs
constant CLEANUP_PATTERNS = {
    " newstemp*.html > ",
    " modnewstemp*.html > ",
    " wget-log.* > "
}
for i = 1 to length(CLEANUP_PATTERNS) do
    system(del_cmd & CLEANUP_PATTERNS[i] & null_device, 2)
end for