From df30fbed23fe07a4f515109dd80a940e0ac56c0e Mon Sep 17 00:00:00 2001 From: Raffaele Mignone Date: Sun, 24 Jan 2021 14:26:07 +0000 Subject: [PATCH] First release Addition of the CLI for the generation of podcast feeds of Ad Alta Voce. This first version includes the commands: - `single` generate a podcast feed of the given audiobook - `all` generate a podcast feed for all audiobooks --- README.md | 48 +++++++++++++++ ad-alta-voce.cabal | 51 ++++++++++++---- app/Main.hs | 15 ++++- data/templates/podcast.mustache | 29 +++++++++ package.yaml | 15 ++++- src/Command/All.hs | 46 ++++++++++++++ src/Command/CLI.hs | 67 +++++++++++++++++++++ src/Command/Single.hs | 58 ++++++++++++++++++ src/Lib.hs | 6 -- src/Scraper/Audiobook.hs | 78 ++++++++++++++++++++++++ src/Scraper/Playlist.hs | 45 ++++++++++++++ src/Types.hs | 103 ++++++++++++++++++++++++++++++++ 12 files changed, 540 insertions(+), 21 deletions(-) create mode 100644 data/templates/podcast.mustache create mode 100644 src/Command/All.hs create mode 100644 src/Command/CLI.hs create mode 100644 src/Command/Single.hs delete mode 100644 src/Lib.hs create mode 100644 src/Scraper/Audiobook.hs create mode 100644 src/Scraper/Playlist.hs create mode 100644 src/Types.hs diff --git a/README.md b/README.md index 2d12297..9911540 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,51 @@ # Ad Alta Voce Script per la generazione dei Feed Podcast di [Ad Alta Voce](https://www.raiplayradio.it/programmi/adaltavoce/). + +## Utilizzo + +### Installazione + +Una volta scaricato il repository è possibile installare il programma tramite `stack` attraverso il seguente comando: + +```bash +stack install +``` + +L'eseguibile può essere chiamato attraverso il comando `loud`. 
+ +### Generazione di un singolo podcast + +È possibile generare il feed podcast di un audiobook tramite il seguente comando: + +```bash +loud single <URL> +``` + +Una lista di tutti gli audiolibri di Ad Alta Voce può essere recuperata al seguente [link](https://www.raiplayradio.it/programmi/adaltavoce/archivio/audiolibri/tutte/). + +### Generazione di tutti i podcast + +È possibile generare in automatico tutti i feed podcast degli audiobook attraverso il seguente comando: + +```bash +loud all +``` + +### Help page + +È possibile visualizzare le opzioni aggiuntive attraverso la *help page* richiamabile attraverso il seguente comando: + +```bash +loud --help +``` + +## Utilizzo tramite stack + +Alternativamente è possibile utilizzare `loud` attraverso `stack` senza doverlo installare. +In questo caso i comandi sono: + +```bash +stack run -- single <URL> +stack run -- all +``` diff --git a/ad-alta-voce.cabal b/ad-alta-voce.cabal index 595cfd3..4d2fb04 100644 --- a/ad-alta-voce.cabal +++ b/ad-alta-voce.cabal @@ -5,36 +5,51 @@ cabal-version: 1.12 -- see: https://github.com/sol/hpack name: ad-alta-voce -version: 0.1.0.0 -description: Please see the README on GitHub at -homepage: https://github.com/githubuser/ad-alta-voce#readme -bug-reports: https://github.com/githubuser/ad-alta-voce/issues -author: Author name here -maintainer: example@example.com -copyright: 2021 Author name here -license: BSD3 +version: 0.0.0.1 +description: Script per la generazione dei Feed Podcast di Ad Alta Voce +homepage: https://github.com/norangebit/ad-alta-voce#readme +bug-reports: https://github.com/norangebit/ad-alta-voce/issues +author: Raffaele Mignone +maintainer: git@norangeb.it +copyright: 2021 norangebit +license: GPL3 license-file: LICENSE build-type: Simple extra-source-files: README.md ChangeLog.md +data-files: + templates/podcast.mustache +data-dir: data source-repository head type: git - location: https://github.com/githubuser/ad-alta-voce + location: 
https://github.com/norangebit/ad-alta-voce library exposed-modules: - Lib + Command.All + Command.CLI + Command.Single + Scraper.Audiobook + Scraper.Playlist + Types other-modules: Paths_ad_alta_voce hs-source-dirs: src build-depends: base >=4.7 && <5 + , directory + , mustache + , optparse-applicative + , parsec + , scalpel + , text + , time default-language: Haskell2010 -executable ad-alta-voce-exe +executable loud main-is: Main.hs other-modules: Paths_ad_alta_voce @@ -44,6 +59,13 @@ executable ad-alta-voce-exe build-depends: ad-alta-voce , base >=4.7 && <5 + , directory + , mustache + , optparse-applicative + , parsec + , scalpel + , text + , time default-language: Haskell2010 test-suite ad-alta-voce-test @@ -57,4 +79,11 @@ test-suite ad-alta-voce-test build-depends: ad-alta-voce , base >=4.7 && <5 + , directory + , mustache + , optparse-applicative + , parsec + , scalpel + , text + , time default-language: Haskell2010 diff --git a/app/Main.hs b/app/Main.hs index de1c1ab..1ea2c1e 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -1,6 +1,17 @@ +{-| +Module : Main +Copyright : (c) Raffaele Mignone 2021 +License : GPL-3 +Maintainer : git@norangeb.it +-} + module Main where -import Lib +import Options.Applicative ( execParser ) +import Command.CLI ( commandParserInfo, execute ) main :: IO () -main = someFunc +main = do + cliCommand <- execParser commandParserInfo + execute cliCommand + \ No newline at end of file diff --git a/data/templates/podcast.mustache b/data/templates/podcast.mustache new file mode 100644 index 0000000..977bb40 --- /dev/null +++ b/data/templates/podcast.mustache @@ -0,0 +1,29 @@ + + + {{audiobook-title}} + {{base-url}} + + https://www.raiplayradio.it/{{audiobook-cover-url}} + {{audiobook-cover-title}} + {{base-url}} + + {{audiobook-description}} + it-it + {{pub-day}} + Ad Alta Voce - Rai Radio 3 + {{audiobook-description}} + Ad Alta Voce + No + + {{pub-day}} + {{#episodes}} + + {{episode-title}} + https://www.raiplayradio.it/{{episode-url}} + + 
{{episode-track-url}} + {{episode-duration}} + + {{/episodes}} + + diff --git a/package.yaml b/package.yaml index 4a3d423..0f2140a 100644 --- a/package.yaml +++ b/package.yaml @@ -1,5 +1,5 @@ name: ad-alta-voce -version: 0.0.0.0 +version: 0.0.0.1 github: "norangebit/ad-alta-voce" license: GPL3 author: "Raffaele Mignone" @@ -10,6 +10,10 @@ extra-source-files: - README.md - ChangeLog.md +data-dir: data +data-files: +- templates/podcast.mustache + # Metadata used when publishing your package # synopsis: Short description of your package # category: Web @@ -21,12 +25,19 @@ description: Script per la generazione dei Feed Podcast di Ad Alta Voce dependencies: - base >= 4.7 && < 5 +- directory +- text +- time +- scalpel +- mustache +- parsec +- optparse-applicative library: source-dirs: src executables: - ad-alta-voce-exe: + loud: main: Main.hs source-dirs: app ghc-options: diff --git a/src/Command/All.hs b/src/Command/All.hs new file mode 100644 index 0000000..bf6d816 --- /dev/null +++ b/src/Command/All.hs @@ -0,0 +1,46 @@ +{-| +Module : Command.All +Description : Generate podcast for all audiobooks +Copyright : (c) Raffaele Mignone 2021 +License : GPL-3 +Maintainer : git@norangeb.it + +This module exposes the command that generates podcast feeds for all the +audiobooks in Ad Alta Voce library. 
+-}
+
+module Command.All(generateAll) where
+
+import Control.Monad ( join )
+import Data.Maybe ( catMaybes )
+import Text.HTML.Scalpel ( scrapeURL, URL )
+import Command.Single ( single )
+import Scraper.Playlist
+  ( playlistPageNumbersScraper, playlistsUrlScraper )
+
+-- | Root of the Rai Play Radio site, prepended to every scraped audiobook
+-- link.
+baseUrl :: String
+baseUrl = "https://www.raiplayradio.it"
+
+-- | Address of the audiobook archive; page identifiers are appended to it.
+playlistBaseUrl :: String
+playlistBaseUrl = "https://www.raiplayradio.it/programmi/adaltavoce/archivio/audiolibri/tutte/"
+
+-- | Scrape the page identifiers of the archive, then the audiobook urls
+-- listed on each of those pages. The result is 'Nothing' when the page
+-- identifiers cannot be scraped.
+scrapeAudiobooksUrl :: IO (Maybe [URL])
+scrapeAudiobooksUrl =
+  scrapeURL playlistBaseUrl playlistPageNumbersScraper
+    >>= sequence . scrapePlaylistPages
+
+-- | Turn the scraped page identifiers into the action that visits every
+-- archive page and collects the absolute audiobook urls found there.
+-- Pages that cannot be scraped are dropped from the result.
+scrapePlaylistPages :: Maybe [String] -> Maybe (IO [URL])
+scrapePlaylistPages = fmap scrapePages
+  where
+    scrapePages pageIds = do
+      scraped <- mapM scrapePage pageIds
+      return $ map (baseUrl ++) (join (catMaybes scraped))
+    scrapePage pageId =
+      scrapeURL (playlistBaseUrl ++ pageId) playlistsUrlScraper
+
+-- | Generate the podcast feed of every audiobook of the archive inside
+-- @outdir@, one audiobook after the other.
+generateAll :: String -> IO ()
+generateAll outdir = scrapeAudiobooksUrl >>= maybe reportFailure generateEach
+  where
+    reportFailure = putStrLn "Error"
+    generateEach audiobookUrls = do
+      mapM_ (\audiobookUrl -> single audiobookUrl outdir) audiobookUrls
+      putStrLn "All done.\nEnjoy your books!"
diff --git a/src/Command/CLI.hs b/src/Command/CLI.hs
new file mode 100644
index 0000000..ef2e6f9
--- /dev/null
+++ b/src/Command/CLI.hs
@@ -0,0 +1,67 @@
+{-|
+Module : Command.CLI
+Description : Define CLI interface
+Copyright : (c) Raffaele Mignone 2021
+License : GPL-3
+Maintainer : git@norangeb.it
+
+This module exposes the CLI of the program.
+-}
+
+module Command.CLI where
+
+import Options.Applicative
+import Options.Applicative.Builder
+import Options.Applicative.Types ( ParserInfo, Parser )
+import Command.All ( generateAll )
+import Command.Single ( single )
+
+-- | Options of the @all@ command.
+newtype AllOption = AllOption
+  { outputDirectoryAll :: String -- ^ Directory that receives the feeds.
+  }
+
+-- | Options of the @single@ command.
+data SingleOption = SingleOption
+  { audiobookUrl :: String          -- ^ Url of the audiobook page.
+  , outputDirectorySingle :: String -- ^ Directory that receives the feed.
+  }
+
+-- | A fully parsed invocation of the program.
+data Command = All AllOption | Single SingleOption
+
+-- | The @--output@ / @-o@ option shared by both commands. It defaults to
+-- @out@; only the help text changes from one command to the other.
+outputDirectoryOption :: String -> Parser String
+outputDirectoryOption helpText = strOption
+  (  long "output"
+  <> short 'o'
+  <> metavar "DIRECTORY"
+  <> help helpText
+  <> value "out"
+  <> showDefault
+  )
+
+-- | Parser of the @single@ command: a mandatory @URL@ argument plus the
+-- optional output directory.
+singleParser :: Parser SingleOption
+singleParser = SingleOption
+  <$> argument str (metavar "URL" <> help "Audiobook url")
+  <*> outputDirectoryOption "Directory where to save the podcast"
+
+-- | Parser of the @all@ command: only the optional output directory.
+allParser :: Parser AllOption
+allParser = AllOption
+  <$> outputDirectoryOption "Directory where to save the podcasts"
+
+-- | 'singleParser' with its @--help@ flag and description, wrapped in
+-- 'Single'.
+singleParserInfo :: ParserInfo Command
+singleParserInfo = Single
+  <$> info (singleParser <**> helper)
+    (progDesc "Generate podcast of the given Ad Alta Voce url")
+
+-- | 'allParser' with its @--help@ flag and description, wrapped in 'All'.
+allParserInfo :: ParserInfo Command
+allParserInfo = All
+  <$> info (allParser <**> helper)
+    (progDesc "Generate podcast for all Ad Alta Voce audiobooks")
+
+-- | Dispatch on the sub-command name (@single@ or @all@).
+commandParser :: Parser Command
+commandParser = subparser
+  (  command "single" singleParserInfo
+  <> command "all" allParserInfo
+  )
+
+-- | Top-level parser description handed to 'execParser' by the executable.
+commandParserInfo :: ParserInfo Command
+commandParserInfo = info (commandParser <**> helper)
+  (progDesc "Generate podcast of Ad Alta Voce audiobook")
+
+-- | Run the action selected on the command line.
+execute :: Command -> IO ()
+execute (Single opt) = single (audiobookUrl opt) (outputDirectorySingle opt)
+execute (All opt) = generateAll (outputDirectoryAll opt)
diff --git a/src/Command/Single.hs b/src/Command/Single.hs
new file mode 100644
index 0000000..335361a
--- /dev/null
+++ b/src/Command/Single.hs
@@ -0,0 +1,58 @@
+{-|
+Module : Command.Single
+Description : Generate podcast for an audiobook
+Copyright : (c) Raffaele Mignone 2021
+License : GPL-3
+Maintainer : git@norangeb.it
+
+This module exposes the command that generates the podcast feed for an
+audiobook in the Ad Alta Voce library.
+-}
+
+module Command.Single(single) where
+
+import Data.Text (unpack)
+import Data.Time.Clock ( UTCTime(utctDay), getCurrentTime )
+import Text.HTML.Scalpel ( scrapeURL, URL )
+import Text.Mustache
+import Text.Parsec.Error ( ParseError )
+import System.Directory ( createDirectoryIfMissing )
+import System.IO
+import Paths_ad_alta_voce ( getDataFileName )
+import Scraper.Audiobook ( audiobookScraper )
+import Types
+
+-- | Locate the @templates@ data directory shipped with the package and
+-- compile @podcast.mustache@ from it.
+compilePodcastTemplate :: IO (Either ParseError Template)
+compilePodcastTemplate = do
+  templateDir <- getDataFileName "templates"
+  automaticCompile [templateDir] templateName
+  where
+    templateName = "podcast.mustache"
+
+-- | Download the page at @url@ and run 'audiobookScraper' on it.
+scrapeAudiobook :: URL -> IO (Maybe Audiobook)
+scrapeAudiobook url = scrapeURL url audiobookScraper
+
+-- | Render the podcast with the compiled template and write the result to
+-- @outdir@, creating the directory when it is missing.
+--
+-- A template that failed to compile or an audiobook that failed to parse is
+-- reported on standard output and nothing is written.
+writePodcastTemplate :: Either ParseError Template -> Maybe Podcast -> String -> IO ()
+writePodcastTemplate (Left err) _ _ = print err
+writePodcastTemplate _ Nothing _ = putStrLn "Error during audiobook parsing"
+writePodcastTemplate (Right template) (Just podcast) outdir = do
+  createDirectoryIfMissing True outdir
+  withFile fileName WriteMode $ \handle -> do
+    -- Handles start out with the locale encoding, so under a non-UTF-8
+    -- locale hPutStr throws on characters it cannot represent (accented
+    -- titles, for instance). Force UTF-8 to make the output independent
+    -- of the environment.
+    hSetEncoding handle utf8
+    hPutStr handle $ unpack xmlPodcast
+  -- Report success only once the handle has been flushed and closed.
+  putStrLn output
+  where
+    xmlPodcast = substitute template podcast
+    title = audiobookTitle $ audiobook podcast
+    fileName = outdir ++ "/" ++ generatePodcastFileName podcast
+    output = title ++ " done!"
+
+-- | Generate the podcast feed of the audiobook found at @url@ and write it
+-- into @outdir@. The feed is stamped with the current UTC day.
+single :: String -> String -> IO ()
+single url outdir = do
+  today <- fmap utctDay getCurrentTime
+  scraped <- scrapeAudiobook url
+  template <- compilePodcastTemplate
+  writePodcastTemplate template (fmap (generatePodcast today url) scraped) outdir
diff --git a/src/Lib.hs b/src/Lib.hs
deleted file mode 100644
index d36ff27..0000000
--- a/src/Lib.hs
+++ /dev/null
@@ -1,6 +0,0 @@
-module Lib
-    ( someFunc
-    ) where
-
-someFunc :: IO ()
-someFunc = putStrLn "someFunc"
diff --git a/src/Scraper/Audiobook.hs b/src/Scraper/Audiobook.hs
new file mode 100644
index 0000000..a00ccbe
--- /dev/null
+++ b/src/Scraper/Audiobook.hs
@@ -0,0 +1,78 @@
+{-|
+Module : Scraper.Audiobook
+Description : Scrape audiobook information from Rai Play Radio
+Copyright : (c) Raffaele Mignone 2021
+License : GPL-3
+Maintainer : git@norangeb.it
+
+This module contains all selectors and scrapers needed to retrieve the
+information from the audiobook.
+An example of a web page that can be scraped is available at the following
+
+-}
+
+{-# LANGUAGE OverloadedStrings #-}
+
+module Scraper.Audiobook(audiobookScraper) where
+
+import Text.HTML.Scalpel
+import Types ( Audiobook(Audiobook), Episode(Episode) )
+
+-- | Block of the page that holds the title and description of the audiobook.
+audiobookHeaderSelector :: Selector
+audiobookHeaderSelector = "div" @: [hasClass "descriptionProgramma"]
+
+-- | Text of the @h2@ found inside the header block.
+audiobookTitleScraper :: Scraper String String
+audiobookTitleScraper = text (audiobookHeaderSelector // "h2")
+
+-- | Text of the description @span@ found inside the header block.
+audiobookDescriptionScraper :: Scraper String String
+audiobookDescriptionScraper = text (audiobookHeaderSelector // descriptionSpan)
+  where
+    descriptionSpan = "span" @: [hasClass "textDescriptionProgramma"]
+
+-- | @src@ attribute of the cover image.
+audiobookCoverUrlScraper :: Scraper String String
+audiobookCoverUrlScraper = attr "src" coverImage
+  where
+    coverImage = "img" @: [hasClass "imgHomeProgramma"]
+
+-- | Every @li@ of the episode list; each one describes a single episode.
+episodesSelector :: Selector
+episodesSelector = "ol" @: [hasClass "elencoPlaylist"] // "li"
+
+-- | Text of the @h2@ of an episode item.
+episodeTitleScraper :: Scraper String String
+episodeTitleScraper = text "h2"
+
+-- | @data-mediapolis@ attribute of an episode item.
+episodeTrackUrlScraper :: Scraper String String
+episodeTrackUrlScraper = attr "data-mediapolis" "li"
+
+-- | @data-image@ attribute of an episode item.
+episodeCoverUrlScraper :: Scraper String String
+episodeCoverUrlScraper = attr "data-image" "li"
+
+-- | @data-href@ attribute of an episode item.
+episodeUrlScraper :: Scraper String String
+episodeUrlScraper = attr "data-href" "li"
+
+-- | Text of the duration @span@ of an episode item.
+episodeDurationScraper :: Scraper String String
+episodeDurationScraper = text durationSpan
+  where
+    durationSpan = "span" @: [hasClass "timePlaylist"]
+
+-- | Build an 'Episode' out of a single episode item.
+episodeScraper :: Scraper String Episode
+episodeScraper = Episode
+  <$> episodeUrlScraper
+  <*> episodeTitleScraper
+  <*> episodeTrackUrlScraper
+  <*> episodeCoverUrlScraper
+  <*> episodeDurationScraper
+
+-- | Run 'episodeScraper' on every item matched by 'episodesSelector'.
+episodesListScraper :: Scraper String [Episode]
+episodesListScraper = chroots episodesSelector episodeScraper
+
+-- | The 'audiobookScraper' function defines the scraper that retrieves the
+-- 'Audiobook' from the given web page.
+audiobookScraper :: Scraper String Audiobook
+audiobookScraper = Audiobook
+  <$> audiobookTitleScraper
+  <*> audiobookDescriptionScraper
+  <*> audiobookCoverUrlScraper
+  <*> episodesListScraper
diff --git a/src/Scraper/Playlist.hs b/src/Scraper/Playlist.hs
new file mode 100644
index 0000000..8035a34
--- /dev/null
+++ b/src/Scraper/Playlist.hs
@@ -0,0 +1,45 @@
+{-|
+Module : Scraper.Playlist
+Description : Scrape audiobook link from Rai Play Radio
+Copyright : (c) Raffaele Mignone 2021
+License : GPL-3
+Maintainer : git@norangeb.it
+
+This module contains all selectors and scrapers needed to retrieve the
+links to the audiobooks.
+An example of a web page that can be scraped is available at the following
+
+-}
+
+{-# LANGUAGE OverloadedStrings #-}
+
+module Scraper.Playlist
+  ( playlistsUrlScraper
+  , playlistPageNumbersScraper
+  ) where
+
+import Text.HTML.Scalpel
+
+-- | The 'playlistsUrlScraper' function defines the scraper that retrieves
+-- the url of every audiobook contained in a playlist page: the @href@ of
+-- the link found inside each @bloccoPlaylist@ block.
+playlistsUrlScraper :: Scraper String [String]
+playlistsUrlScraper = chroots playlistBlock playlistLink
+  where
+    playlistBlock :: Selector
+    playlistBlock = "div" @: [hasClass "bloccoPlaylist"]
+
+    playlistLink :: Scraper String String
+    playlistLink = attr "href" "a"
+
+-- | Audiobooks are listed on multiple pages; 'playlistPageNumbersScraper'
+-- defines the scraper that retrieves the identifier of all pages that
+-- contain some audiobooks. The identifier is the link text of each
+-- pagination item that does not carry the @archivePaginatation@ class.
+playlistPageNumbersScraper :: Scraper String [String]
+playlistPageNumbersScraper = chroots pageItem pageLabel
+  where
+    pageItem :: Selector
+    pageItem =
+      "ul" @: [hasClass "pagination"]
+        // ("li" @: [notP (hasClass "archivePaginatation")])
+
+    pageLabel :: Scraper String String
+    pageLabel = text "a"
diff --git a/src/Types.hs b/src/Types.hs
new file mode 100644
index 0000000..1490386
--- /dev/null
+++ b/src/Types.hs
@@ -0,0 +1,103 @@
+{-|
+Module : Types
+Description : Defines data types
+Copyright : (c) Raffaele Mignone 2021
+License : GPL-3
+Maintainer : git@norangeb.it
+
+This module defines the types used by the program and the functions to access
+their fields.
+-}
+
+{-# LANGUAGE OverloadedStrings #-}
+
+module Types
+  ( Audiobook(Audiobook)
+  , Episode(Episode)
+  , Podcast(Podcast)
+  , generatePodcast
+  , episodeUrl
+  , episodeTitle
+  , episodeTrackUrl
+  , episodeCoverUrl
+  , episodeDuration
+  , audiobookTitle
+  , audiobookDescription
+  , audiobookCoverUrl
+  , audiobookEpisodes
+  , audiobook
+  , baseUrl
+  , pubDay
+  , generatePodcastFileName
+  ) where
+
+import Data.Char ( toLower )
+import Data.Time.Calendar ( Day )
+import Text.Mustache
+import Text.Mustache.Types ( Pair )
+
+-- | The 'Episode' data type represents an episode of the podcast.
+-- 'Episode' is an instance of the 'ToMustache' typeclass.
+data Episode = Episode
+  { episodeUrl :: String      -- ^ Rendered as @episode-url@.
+  , episodeTitle :: String    -- ^ Rendered as @episode-title@.
+  , episodeTrackUrl :: String -- ^ Rendered as @episode-track-url@.
+  , episodeCoverUrl :: String -- ^ Kept on the record but not rendered.
+  , episodeDuration :: String -- ^ Rendered as @episode-duration@.
+  }
+  deriving Show
+
+-- | The 'Audiobook' data type represents the audiobook of the podcast.
+-- 'Audiobook' is an instance of the 'ToMustache' typeclass.
+data Audiobook = Audiobook
+  { audiobookTitle :: String
+    -- ^ Rendered as @audiobook-title@ and @audiobook-cover-title@.
+  , audiobookDescription :: String
+    -- ^ Rendered as @audiobook-description@.
+  , audiobookCoverUrl :: String
+    -- ^ Rendered as @audiobook-cover-url@.
+  , audiobookEpisodes :: [Episode]
+    -- ^ Rendered as the @episodes@ section.
+  }
+  deriving Show
+
+
+-- | The 'Podcast' data type represents the podcast.
+-- 'Podcast' is an instance of the 'ToMustache' typeclass.
+data Podcast = Podcast
+  { audiobook :: Audiobook -- ^ Audiobook published by the feed.
+  , baseUrl :: String      -- ^ Rendered as @base-url@.
+  , pubDay :: Day          -- ^ Rendered as @pub-day@.
+  }
+  deriving (Show)
+
+-- | Template variables contributed by an 'Audiobook'. Shared by the
+-- 'Audiobook' and 'Podcast' instances so that both expose the same keys.
+toPairList :: Audiobook -> [Pair]
+toPairList book =
+  [ "audiobook-title" ~> audiobookTitle book
+  , "audiobook-cover-url" ~> audiobookCoverUrl book
+  , "audiobook-cover-title" ~> audiobookTitle book
+  , "audiobook-description" ~> audiobookDescription book
+  , "episodes" ~> audiobookEpisodes book
+  ]
+
+instance ToMustache Episode where
+  toMustache episode = object
+    [ "episode-title" ~> episodeTitle episode
+    , "episode-url" ~> episodeUrl episode
+    , "episode-track-url" ~> episodeTrackUrl episode
+    , "episode-duration" ~> episodeDuration episode
+    ]
+
+instance ToMustache Audiobook where
+  toMustache = object . toPairList
+
+instance ToMustache Podcast where
+  -- The publication day is rendered through 'show'.
+  toMustache podcast = object $
+    [ "base-url" ~> baseUrl podcast
+    , "pub-day" ~> show (pubDay podcast)
+    ] ++ toPairList (audiobook podcast)
+
+-- | Build the 'Podcast' of an audiobook scraped from @url@ on the given day.
+generatePodcast :: Day -> String -> Audiobook -> Podcast
+generatePodcast day url book = Podcast book url day
+
+-- | Name of the file that stores the feed: the lower-cased audiobook title
+-- with spaces turned into dashes, followed by @.xml@.
+--
+-- Path separators are turned into dashes as well. The title is scraped from
+-- a web page, and a title containing @/@ would otherwise point into a
+-- sub-directory that does not exist and make the write fail.
+generatePodcastFileName :: Podcast -> String
+generatePodcastFileName podcast = map sanitize title ++ ".xml"
+  where
+    title = audiobookTitle (audiobook podcast)
+
+    sanitize :: Char -> Char
+    sanitize c
+      | c `elem` [' ', '/', '\\'] = '-'
+      | otherwise = toLower c