Extracting Links

reqwest-badge select-badge cat-net-badge

Use reqwest::get to perform an HTTP GET request and then use Document::from to parse the response body into an HTML document. Calling find with the predicate Name("a") retrieves all links. Calling filter_map on the resulting Selection retrieves the URLs from the links that have an "href" attr (attribute).

use error_chain::error_chain;
use select::document::Document;
use select::predicate::Name;

error_chain! {
    foreign_links {
        ReqError(reqwest::Error);
        IoError(std::io::Error);
    }
}

/// Downloads the Rust home page and prints the `href` of every `<a>` element,
/// one per line.
///
/// # Errors
/// Returns an error if the HTTP request or reading the response body fails.
#[tokio::main]
async fn main() -> Result<()> {
    let response = reqwest::get("https://www.rust-lang.org/en-US/").await?;
    let body = response.text().await?;
    let document = Document::from(body.as_str());

    // Anchors without an `href` attribute are skipped by `filter_map`.
    let hrefs = document.find(Name("a")).filter_map(|node| node.attr("href"));
    for href in hrefs {
        println!("{}", href);
    }

    Ok(())
}

reqwest-badge select-badge url-badge cat-net-badge

Call get_base_url to retrieve the base URL. If the document has a base tag, its href attr is used as the base URL; otherwise, the original URL up to Position::BeforePath acts as the default.

The program iterates through the links in the document, parses each one with url::ParseOptions and Url::parse, and creates a tokio::spawn task for every parsed link. Each task makes a request to its link with reqwest and verifies the StatusCode. The program then awaits the completion of all tasks before ending.

use error_chain::error_chain; use reqwest::StatusCode; use select::document::Document; use select::predicate::Name; use std::collections::HashSet; use url::{Position, Url}; error_chain! { foreign_links { ReqError(reqwest::Error); IoError(std::io::Error); UrlParseError(url::ParseError); JoinError(tokio::task::JoinError); } } async fn get_base_url(url: &Url, doc: &Document) -> Result<Url> { let base_tag_href = doc.find(Name("base")).filter_map(|n| n.attr("href")).nth(0); let base_url = base_tag_href.map_or_else(|| Url::parse(&url[..Position::BeforePath]), Url::parse)?; Ok(base_url) } async fn check_link(url: &Url) -> Result<bool> { let res = reqwest::get(url.as_ref()).await?; Ok(res.status() != StatusCode::NOT_FOUND) } #[tokio::main] async fn main() -> Result<()> { let url = Url::parse("https://www.rust-lang.org/en-US/")?; let res = reqwest::get(url.as_ref()).await?.text().await?; let document = Document::from(res.as_str()); let base_url = get_base_url(&url, &document).await?; let base_parser = Url::options().base_url(Some(&base_url)); let links: HashSet<Url> = document .find(Name("a")) .filter_map(|n| n.attr("href")) .filter_map(|link| base_parser.parse(link).ok()) .collect(); let mut tasks = vec![]; for link in links { tasks.push(tokio::spawn(async move { if check_link(&link).await.unwrap() { println!("{} is OK", link); } else { println!("{} is Broken", link); } })); } for task in tasks { task.await? } Ok(()) }

reqwest-badge regex-badge cat-net-badge

Pull the source of a MediaWiki page using reqwest::get and then look for all entries of internal and external links with Regex::captures_iter. Using Cow avoids excessive String allocations.

MediaWiki link syntax is described here.

use lazy_static::lazy_static;
use regex::Regex;
use std::borrow::Cow;
use std::collections::HashSet;
use std::error::Error;

/// Collects the unique link targets found in MediaWiki markup.
///
/// Internal links (`[[target|label]]`) are returned lower-cased as owned
/// strings; external links (introduced by `url=`, `URL|` or `[` and starting
/// with `http`) are returned as slices borrowed from `content`.
///
/// # Panics
/// Panics if the built-in pattern fails to compile.
fn extract_links(content: &str) -> HashSet<Cow<str>> {
    lazy_static! {
        // The pattern uses verbose mode (`(?x)`), where `#` starts a comment
        // that runs to the end of the line, so the line breaks inside this
        // raw string are significant. Verbose mode also ignores literal
        // whitespace, even inside character classes, so the space that ends
        // an external URL is written as `\x20`.
        static ref WIKI_REGEX: Regex = Regex::new(
            r"(?x)
                \[\[(?P<internal>[^\[\]|]*)[^\[\]]*\]\]        # internal links
                |
                (url=|URL\||\[)(?P<external>http.*?)[\x20\|}]  # external links
            "
        )
        .unwrap();
    }

    let links: HashSet<_> = WIKI_REGEX
        .captures_iter(content)
        .map(|c| match (c.name("internal"), c.name("external")) {
            (Some(val), None) => Cow::from(val.as_str().to_lowercase()),
            (None, Some(val)) => Cow::from(val.as_str()),
            // Each match comes from exactly one of the two alternatives.
            _ => unreachable!(),
        })
        .collect();

    links
}

/// Downloads the raw wikitext of the Wikipedia article on Rust and
/// pretty-prints the set of links it contains.
///
/// # Errors
/// Returns an error if the HTTP request or reading the response body fails.
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let content = reqwest::get(
        "https://en.wikipedia.org/w/index.php?title=Rust_(programming_language)&action=raw",
    )
    .await?
    .text()
    .await?;

    println!("{:#?}", extract_links(content.as_str()));

    Ok(())
}