Parse HTML into a typed tree, then search and filter it — zero dependencies.
html-filter is a small Rust library for working with HTML. You give it an HTML string; it gives back a typed recursive tree. You can then walk that tree directly, or use the built-in Filter API to select, exclude, and extract exactly what you need.
This comes in handy for extracting links, headings, metadata, or any specific elements or text from an HTML document. It is also useful for cleaning up HTML by removing script and style tags, comments, etc.
It is not intended for validating HTML (the parser is lenient by design, as pages often contain invalid syntax).
cargo add html_filter

use html_filter::*;
let src = r#"
<!DOCTYPE html>
<html lang="en">
<head>
<title>My Page</title>
</head>
<body>
<h1>Hello!</h1>
<p>Welcome to my page.</p>
</body>
</html>
"#;
let tree: Html = Html::parse(src).expect("invalid HTML");

Filter uses a builder pattern: start with Filter::new() and chain as many conditions as you need. Call .filter() on a parsed tree to get back an Html containing every node that matched. Here are a few examples:
use html_filter::*;
let src = r#"
<nav>
<a href="/home">Home</a>
<a href="/about">About</a>
<a href="/contact">Contact</a>
</nav>
"#;
let filter = Filter::new().tag_name("a");
let result = Html::parse(src).unwrap().filter(&filter);
// All three <a> tags are collected into an Html::Vec.
if let Html::Vec(links) = result {
assert_eq!(links.len(), 3);
}

use html_filter::*;
let src = r#"
<form>
<input type="text" name="username" />
<input type="password" name="pass" />
<input type="submit" value="Login" />
</form>
"#;
// Find only the submit button.
let filter = Filter::new().attribute_value("type", "submit");
let result = Html::parse(src).unwrap().find(&filter);
if let Html::Tag { tag, .. } = result {
assert_eq!(tag.find_attr_value("value").unwrap(), "Login");
}

HTML class attributes can contain several class names separated by spaces.
attribute_value_contains matches when the given word appears anywhere in that list.
use html_filter::*;
let src = r#"
<ul>
<li class="item featured">Rust</li>
<li class="item">Go</li>
<li class="item featured">Zig</li>
</ul>
"#;
// Grab only the featured items.
let filter = Filter::new().attribute_value_contains("class", "featured");
let result = Html::parse(src).unwrap().filter(&filter);
if let Html::Vec(items) = result {
assert_eq!(items.len(), 2);
}

Every positive selector has a negative counterpart. They can be mixed freely.
use html_filter::*;
let src = r#"
<div>
<p class="visible">Keep me</p>
<p class="hidden">Discard me</p>
<script>alert('also gone')</script>
</div>
"#;
// Keep everything, but strip <script> tags and elements with class "hidden".
let filter = Filter::new()
.except_tag_name("script")
.except_attribute_value_contains("class", "hidden");
let result = Html::parse(src).unwrap().filter(&filter);
// Only the first <p> survives.
if let Html::Tag { tag, .. } = result {
assert_eq!(tag.as_name(), "p");
}

find is a shorthand for filter that returns only the first matching node instead of all of them. Use it when you know there is exactly one element you care about.
use html_filter::*;
let src = r##"
<article>
<h1 id="title">Getting Started</h1>
<p>First paragraph.</p>
<p>Second paragraph.</p>
</article>
"##;
// Get the element whose id is "title".
let filter = Filter::new().attribute_value("id", "title");
let heading = Html::parse(src).unwrap().find(&filter);
if let Html::Tag { tag, child, .. } = heading
&& let Html::Text(text) = *child
{
assert_eq!(tag.as_name(), "h1");
assert_eq!(text, "Getting Started");
}

By default, filter returns exactly the nodes that matched. Setting depth(n) tells the filter to also keep up to n levels of ancestors around each match. This is very useful when you want to keep a tag based on its content rather than on the tag itself.
use html_filter::*;
let src = r#"<nav><ul>
<li href="first">First</li>
<li href="second">Second</li>
<li href="third">Third</li>
</ul></nav>"#;
// depth(0) — default: return only the matched <li>.
let filter = Filter::new().attribute_value("href", "second");
if let Html::Vec(items) = Html::parse(src).unwrap().filter(&filter) {
if let Html::Tag { tag, .. } = &items[0] {
assert_eq!(tag.as_name(), "li");
}
}
// depth(1): return the <ul> that contains the matched <li>.
let filter = Filter::new().attribute_value("href", "second").depth(1);
if let Html::Tag { tag, .. } = Html::parse(src).unwrap().filter(&filter) {
assert_eq!(tag.as_name(), "ul");
}
// depth(2): return the <nav>.
let filter = Filter::new().attribute_value("href", "second").depth(2);
if let Html::Tag { tag, .. } = Html::parse(src).unwrap().filter(&filter) {
assert_eq!(tag.as_name(), "nav");
}

You can strip or keep comments, doctype declarations, and text nodes independently of tag filtering.
use html_filter::*;
let src = r#"<!DOCTYPE html>
<!-- page header -->
<h1>Title</h1>
"#;
// Keep only tags; remove the doctype and comments.
let filter = Filter::new()
.tag_name("h1")
.comment(false)
.doctype(false);
let result = Html::parse(src).unwrap().filter(&filter);
if let Html::Tag { tag, .. } = result {
assert_eq!(tag.as_name(), "h1");
}

Convenience methods for common cases:
use html_filter::*;
let src = r#"<!DOCTYPE html><!-- comment --><p>text</p>"#;
// Strip all `<!-- -->` comments.
let r = Html::parse(src).unwrap().filter(&Filter::new().comment(false));
assert_eq!(r.to_string(), "<!DOCTYPE html><p>text</p>");
// Strip all `<!…>` doctype nodes.
let r = Html::parse(src).unwrap().filter(&Filter::new().doctype(false));
assert_eq!(r.to_string(), "<!-- comment --><p>text</p>");
// Strip all bare text nodes.
let r = Html::parse(src).unwrap().filter(&Filter::new().text(false));
assert_eq!(r.to_string(), "<!DOCTYPE html><!-- comment --><p></p>");
// Keep everything except comments.
let r = Html::parse(src).unwrap().filter(&Filter::new().all_except_comment());
assert_eq!(r.to_string(), "<!DOCTYPE html><p>text</p>");
// Keep only text nodes (no comments or doctypes).
let r = Html::parse(src).unwrap().filter(&Filter::new().none_except_text());
assert_eq!(r.to_string(), "<p>text</p>");

Once you have an Html::Tag, you can interrogate its Tag and Attributes directly.
use html_filter::*;
let src = r#"<a id="crates" href="https://crates.io" enabled>crates.io</a>"#;
let html = Html::parse(src).unwrap();
if let Html::Tag { tag, child, .. } = html {
// Name of the tag.
assert_eq!(tag.as_name(), "a");
// Read an attribute value by name (returns None for value-less attributes).
assert_eq!(tag.find_attr_value("href").unwrap(), "https://crates.io");
assert!(tag.find_attr_value("enabled").is_none()); // value-less
// Inner text.
if let Html::Text(text) = *child {
assert_eq!(text, "crates.io");
}
}

into_attr_value consumes the tag and returns the value as an owned String, useful when you want to move the string out without cloning:
use html_filter::*;
let src = r#"<meta name="description" content="A great page." />"#;
if let Html::Tag { tag, .. } = Html::parse(src).unwrap() {
let content: String = tag.into_attr_value("content").unwrap();
assert_eq!(content, "A great page.");
}

All operations have both a consuming variant (takes self) and a borrowing variant (takes &self):
use html_filter::*;
let src = r#"<ul><li>one</li><li>two</li></ul>"#;
let filter = Filter::new().tag_name("li");
let html = Html::parse(src).unwrap();
// Borrowing variants keep the original value.
let filtered = html.to_filtered(&filter);
let first = html.to_found(&filter);
// Consuming variants take ownership.
let filtered = html.clone().filter(&filter);
let first = html.find(&filter);
if let Html::Tag { tag, .. } = first {
assert_eq!(tag.as_name(), "li");
}

Licensed under either of
at your option.
Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions.