-----------------------------------------------------------------------------
-- |
-- Module      :  UrlCheck.hs
-- Copyright   :  (c) Don Stewart 2006
-- License     :  BSD3-style (see LICENSE)
-- 
-- Maintainer  :  dons@cse.unsw.edu.au
-- Stability   :  stable
-- Portability :  portable
--
-----------------------------------------------------------------------------

import Data.Char        (isControl)
import Data.List
import Text.Printf

import Data.ByteString.Lazy.Char8 (ByteString, pack, unpack)
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.Set as S
import qualified Data.Map as M

import Control.Monad.State

import Control.Concurrent
import Control.Exception  (handle, finally)

import System.Console.GetOpt
import System.Environment
import System.Exit
import System.IO
import System.Time

import TinyHTTP


data Job = Job ByteString | Done

main = time $ do
    (files,k) <- parseArgs
    proxy     <- getProxy
    let n = length files

    bad      <- newMVar (0 :: Int)
    errs     <- newChan
    jobs     <- newChan
    children <- newMVar []

    forkIO (writer errs)
    fork k children (thread errs jobs bad proxy)
    stats    <- execStateT (mapM_ check files) (empty jobs)
    replicateM_ k (writeChan jobs Done)
    wait children
    broken   <- takeMVar bad

    printf fmt broken
               (found stats)
               (S.size (cache stats))
               n
               (if n > 1 then "s" else "")
  where
    empty = UC S.empty 0
    fmt   = "Found %d broken links. Checked %d links (%d unique) in %d file%s.\n"

--
-- fork k children threads
--
fork k cs f = flip mapM_ [1..k] $ \n -> do
    mv <- newEmptyMVar
    modifyMVar_ cs (return . (mv :))
    forkIO (f n `finally` putMVar mv ())

--
-- just print out failed urls as they arrive
--
writer c = getChanContents c >>= mapM_ (\s -> putStrLn s >> hFlush stdout)

--
-- wait on a list of children threads
-- 
wait cs = do
   xs <- takeMVar cs
   case xs of
     []   -> return ()
     m:ms -> do
        putMVar cs ms
        takeMVar m
        wait cs

-- 
-- extract a list of urls from a file, and write them into the job queue
-- need to do this in smp-parallel fashion
--
check f = do
    src  <- io (B.readFile f)
    let urls = extract src
    bad  <- filterM seenURI urls
    sendJobs bad
    updateStats (length urls)

--
-- read jobs from the queue, check if they work on the network
--
thread errs queue bad proxy n = loop
  where
    loop = do
        job <- readChan queue
        case job of
            Done  -> return ()
            Job x -> run (B.unpack x) >> loop

    inc = modifyMVar_ bad (return . (+1))

    run url = case parseURI url of
        Just uri -> do
            mn <- handle (return . Left . show) (getStatus uri proxy)
            case mn of
                Right 200 -> return ()
                Right n   -> next (show n)
                Left err  -> next err
        _ -> next "Invalid URL"

        where next s = inc >> writeChan errs (url ++ " " ++ s)

--
-- Url cache type and statistics
--

data UC =
    UC { cache   :: S.Set ByteString,
         found   :: Int,
         queue   :: Chan Job }

updateStats a = modify $ \s ->
    s { found  = found  s + a }

insertURI c     = modify $ \s ->
    s { cache = S.insert c (cache s) }

seenURI u = do
    v <- (not . S.member u) `fmap` gets cache
    insertURI u
    return v

sendJobs js = do
    c <- gets queue
    io $ mapM_ (writeChan c . Job) js

io = liftIO

--
-- URI extraction
--

extract :: ByteString -> [ByteString]
extract = concatMap split . B.lines

split :: ByteString -> [ByteString]
split ln = uris ln

uris s = filter (\s -> not (B.null s) && looksOk s) (B.splitWith isDelim s)
  where
    isDelim c = isControl c || c `elem` " <>\"{}|\\^[]`"
    looksOk s = http `B.isPrefixOf` s
    http      = pack "http:"

--
-- Argument handling
--

data Flag = Help | N Int deriving Eq

options =
    [Option ['h'] ["help"] (NoArg Help)
        "Show this help message"
    ,Option ['n'] []       (ReqArg (\s -> N (read s)) "N")
        "Number of concurrent connections (default 16)" ]

parseArgs = do
    argv <- getArgs
    case parse argv of
        ([], fs, [])         -> return (nub fs, 16)
        (as, fs, [])
            | Help `elem` as              -> help
            | [N n] <- filter (/=Help) as -> return (nub fs, n)
        (_,_,errs)                        -> die errs
  where
    parse argv = getOpt Permute options argv
    header     = "Usage: urlcheck [-h] [-n n] [file ...]"
    info       = usageInfo header options
    dump       = hPutStrLn stderr
    die errs   = dump (concat errs ++ info) >> exitWith (ExitFailure 1)
    help       = dump info                  >> exitWith ExitSuccess

getProxy = handle (\_ -> return Nothing) $ do
    env <- M.fromList `fmap` getEnvironment
    return $! do
        s <- M.lookup "http_proxy" env
        a <- parseURI s
        v <- uriAuthority a
        let host = uriRegName v
            port = read (tail (uriPort v))
        return (host, port)

--
-- Time a computation
--
time a = do
    start <- getClockTime
    a
    end   <- getClockTime
    let diff = diffClockTimes end start
        s    = timeDiffToString diff
    printf "Search time: %s\n" s
