From 2fc81f5a5ea89afdf9ada726a440f6df8b54fd15 Mon Sep 17 00:00:00 2001
From: Leonard Hecker <leonard@hecker.io>
Date: Wed, 25 Mar 2026 20:53:27 +0100
Subject: [PATCH 01/11] Complete the initial LSH implementation

---
 .vscode/launch.json                       |   6 +-
 assets/highlighting-tests/bash.sh         |  71 ++++++++
 assets/highlighting-tests/batch.bat       |  41 +++++
 assets/highlighting-tests/html.html       |  51 ++++++
 assets/highlighting-tests/markdown.md     |  75 ++++++++
 assets/highlighting-tests/powershell.ps1  |  78 +++++++++
 assets/highlighting-tests/properties.conf |  13 ++
 assets/highlighting-tests/xml.xml         |  39 +++++
 assets/highlighting-tests/yaml.yml        |  43 +++++
 crates/edit/benches/lib.rs                |  32 +++-
 crates/edit/src/buffer/mod.rs             |  23 ++-
 crates/lsh/definitions/json.lsh           |  39 +++++
 crates/lsh/definitions/lsh.lsh            |  50 ++++++
 crates/lsh/definitions/markdown.lsh       | 204 ++++++++++++++++++++++
 crates/lsh/definitions/powershell.lsh     |  64 +++++++
 crates/lsh/definitions/properties.lsh     |  55 ++++++
 crates/lsh/definitions/utility.lsh        |  21 +++
 crates/lsh/definitions/xml.lsh            |  78 +++++++++
 crates/lsh/definitions/yaml.lsh           |  78 +++++++++
 19 files changed, 1056 insertions(+), 5 deletions(-)
 create mode 100644 assets/highlighting-tests/bash.sh
 create mode 100644 assets/highlighting-tests/batch.bat
 create mode 100644 assets/highlighting-tests/html.html
 create mode 100644 assets/highlighting-tests/markdown.md
 create mode 100644 assets/highlighting-tests/powershell.ps1
 create mode 100644 assets/highlighting-tests/properties.conf
 create mode 100644 assets/highlighting-tests/xml.xml
 create mode 100644 assets/highlighting-tests/yaml.yml
 create mode 100644 crates/lsh/definitions/json.lsh
 create mode 100644 crates/lsh/definitions/lsh.lsh
 create mode 100644 crates/lsh/definitions/markdown.lsh
 create mode 100644 crates/lsh/definitions/powershell.lsh
 create mode 100644 crates/lsh/definitions/properties.lsh
 create mode 100644 crates/lsh/definitions/utility.lsh
 create mode 100644 crates/lsh/definitions/xml.lsh
 create mode 100644 crates/lsh/definitions/yaml.lsh

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 7142960c4f9d..a62e8d6c3a77 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -10,7 +10,7 @@
             "program": "${workspaceFolder}/target/debug/edit",
             "cwd": "${workspaceFolder}",
             "args": [
-                "${workspaceFolder}/crates/edit/src/bin/edit/main.rs"
+                "${workspaceFolder}/assets/highlighting-tests/markdown.md"
             ],
         },
         {
@@ -23,7 +23,7 @@
             "program": "${workspaceFolder}/target/debug/edit",
             "cwd": "${workspaceFolder}",
             "args": [
-                "${workspaceFolder}/crates/edit/src/bin/edit/main.rs"
+                "${workspaceFolder}/assets/highlighting-tests/markdown.md"
             ],
         },
         {
@@ -40,7 +40,7 @@
             "program": "${workspaceFolder}/target/debug/edit",
             "cwd": "${workspaceFolder}",
             "args": [
-                "${workspaceFolder}/crates/edit/src/bin/edit/main.rs"
+                "${workspaceFolder}/assets/highlighting-tests/markdown.md"
             ],
         },
         {
diff --git a/assets/highlighting-tests/bash.sh b/assets/highlighting-tests/bash.sh
new file mode 100644
index 000000000000..dfd5238724db
--- /dev/null
+++ b/assets/highlighting-tests/bash.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+
+# This is a comment
+
+readonly VAR1="Hello"   # String literal
+VAR2=42                 # Integer literal
+VAR3=$((VAR2 + 8))      # Arithmetic expansion
+VAR4=$(echo "World")    # Command substitution
+
+function greet() {      # Function definition
+    local name="$1"     # Local variable, parameter expansion
+    echo "${VAR1}, $name! $VAR4"  # String, parameter expansion, variable
+}
+
+greet "User"            # Function call, string literal
+
+if [[ $VAR2 -gt 40 && $VAR3 -eq 50 ]]; then  # Conditional, test, operators
+    echo "Numbers are correct"   # String literal
+elif (( VAR2 < 40 )); then       # Arithmetic test
+    echo 'VAR2 is less than 40'  # Single-quoted string
+else
+    echo "Other case"
+fi
+
+for i in {1..3}; do     # Brace expansion, for loop
+    echo "Loop $i"      # String, variable
+done
+
+case "$VAR4" in         # Case statement
+    World) echo "It's World";;   # Pattern, string
+    *) echo "Unknown";;          # Wildcard
+esac
+
+arr=(one two three)     # Array
+echo "${arr[1]}"        # Array access
+
+declare -A assoc        # Associative array
+assoc[key]="value"
+echo "${assoc[key]}"
+
+# Here document
+cat <<EOF
+Multi-line
+string with $VAR1
+EOF
+
+# Here string
+grep H <<< "$VAR1"
+
+# Subshell
+(subshell_var=99; echo $subshell_var)
+
+# Redirection
+echo "Redirected" > /dev/null
+
+# Background job
+sleep 1 &
+
+# Arithmetic assignment
+let VAR2+=1
+
+# Process substitution
+diff <(echo foo) <(echo bar)
+
+# Command grouping
+{ echo "Group 1"; echo "Group 2"; }
+
+# Escaped characters
+echo "A quote: \" and a backslash: \\"
+
+# End of file
diff --git a/assets/highlighting-tests/batch.bat b/assets/highlighting-tests/batch.bat
new file mode 100644
index 000000000000..962ef66007af
--- /dev/null
+++ b/assets/highlighting-tests/batch.bat
@@ -0,0 +1,41 @@
+@echo off
+REM --- String, Variable, Label, Command, Operator, Number, Delimiter, Comment ---
+
+:: Label
+:Start
+
+:: Variable assignment and usage
+set "VAR1=Hello"
+set VAR2=World
+
+:: String with spaces and special characters
+set "STR=Batch ^& CMD!"
+
+:: Arithmetic operation (number, operator)
+set /a SUM=5+10
+
+:: IF statement (keyword, operator, string, variable)
+if "%VAR1%"=="Hello" (
+    echo %VAR1%, %VAR2%! %STR%
+) else (
+    echo Not matched!
+)
+
+:: FOR loop (keyword, variable, delimiter, string)
+for %%F in (*.bat) do (
+    echo Found file: %%F
+)
+
+:: CALL command (keyword, label)
+call :SubRoutine
+
+:: GOTO command (keyword, label)
+goto :End
+
+:: Subroutine with parameter
+:SubRoutine
+echo In subroutine with SUM=%SUM%
+goto :eof
+
+:End
+REM End of script
diff --git a/assets/highlighting-tests/html.html b/assets/highlighting-tests/html.html
new file mode 100644
index 000000000000..35682eee1b34
--- /dev/null
+++ b/assets/highlighting-tests/html.html
@@ -0,0 +1,51 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>HTML Syntax Test &amp; Demo</title>
+    <!-- Comment with special chars: <>&"' -->
+    <style>
+        body {
+            margin: 0;
+            padding: 20px;
+        }
+
+        .test {
+            color: #ff0000;
+        }
+    </style>
+</head>
+
+<body>
+    <h1 id="main" class='header' data-value="123">Heading</h1>
+
+    <!-- Various elements -->
+    <p>Text with &lt; &gt; &amp; &quot; &apos; &#65; &#x41; entities</p>
+    <br />
+    <hr>
+    <img src="image.png" alt="Description" />
+
+    <!-- Attributes -->
+    <input type="text" disabled required value="" data-custom="attr">
+    <button onclick="test()">Click</button>
+
+    <!-- Nested elements -->
+    <div>
+        <span>Inline <strong>bold</strong> text</span>
+        <ul>
+            <li>Item 1</li>
+            <li>Item 2</li>
+        </ul>
+    </div>
+
+    <!-- Script -->
+    <script>
+        function test() {
+            return x < 10 && y > 5;
+        }
+    </script>
+</body>
+
+</html>
diff --git a/assets/highlighting-tests/markdown.md b/assets/highlighting-tests/markdown.md
new file mode 100644
index 000000000000..c4845eb68014
--- /dev/null
+++ b/assets/highlighting-tests/markdown.md
@@ -0,0 +1,75 @@
+# H1
+
+## H2
+
+### H3
+
+#### H4
+
+##### H5
+
+###### H6
+
+regular
+*italic*
+_italic_
+**bold**
+__bold__
+***bold italic***
+**_bold italic_**
+__*bold italic*__
+~~strikethrough~~
+`inline code`
+`` `literal` ``
+\*not\* \_italic\_ # not a heading
+
+* Unordered item
+  - Nested item
+    * Third level
+* Task list:
+  * [ ] To do
+  * [x] Done
+  * [ ] *Mixed* **formatting** with `code`
+1. Ordered can start anywhere
+2. …like here (intentional)
+   1. Nested ordered
+   2. Multiple paragraphs within a list item:
+      Still the same item.
+
+> A single-level quote
+>
+> > A nested quote with **bold** and `code`
+>
+> * List in a quote
+> * [Link in quote](#links)
+
+Inline: [Example](https://example.com "Example Title")
+Reference: [Ref Link][ref] and [Another][another-ref]
+Relative: [This section](#tables)
+Footnote: [^note]
+[ref]: https://example.com
+[another-ref]: https://github.com
+[^note]: This is a footnote with **formatting** and a [link](https://github.com).
+
+Inline: ![Alt text](https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png "GitHub Mark")
+Reference: ![Logo][logo-ref]
+[logo-ref]: https://github.githubassets.com/images/modules/logos_page/GitHub-Logo.png "GitHub Logo"
+
+| Left        |   Center   | Right |
+| :---------- | :--------: | ----: |
+| *italic*    |   `code`   |   123 |
+| **bold**    | ~~strike~~ |  4.56 |
+| [link][ref] |   :tada:   | `end` |
+
+```bash
+# Shell
+echo "Hello, world" | tr a-z A-Z
+```
+
+```json
+{
+  "name": "gfm-kitchen-sink",
+  "private": true,
+  "scripts": { "test": "echo ok" }
+}
+```
diff --git a/assets/highlighting-tests/powershell.ps1 b/assets/highlighting-tests/powershell.ps1
new file mode 100644
index 000000000000..e2bb01f39c6c
--- /dev/null
+++ b/assets/highlighting-tests/powershell.ps1
@@ -0,0 +1,78 @@
+# Single-line comment
+
+<#
+Multi-line
+comment
+#>
+
+function Get-SampleData {
+    param(
+        [string]$Name = "World", # String literal, parameter
+        [int]$Count = 3
+    )
+
+    $array = @(1, 2, 3) # Array literal
+    $hashtable = @{ Key1 = 'Value1'; Key2 = 42 } # Hashtable literal
+
+    $nullVar = $null
+    $boolTrue = $true
+    $boolFalse = $false
+
+    $regexMatch = "abc123" -match '\d+' # Regex literal
+
+    for ($i = 0; $i -lt $Count; $i++) {
+        Write-Host "Hello, $Name! Iteration: $i" # Variable interpolation, string
+    }
+
+    if ($hashtable.Key2 -eq 42) {
+        Write-Output "Hashtable value is 42"
+    }
+    elseif ($hashtable.Key2 -gt 40) {
+        Write-Output "Hashtable value is greater than 40"
+    }
+    else {
+        Write-Output "Hashtable value is less than or equal to 40"
+    }
+
+    switch ($Name) {
+        "World" { Write-Host "Default name used." }
+        default { Write-Host "Custom name: $Name" }
+    }
+
+    try {
+        throw "An error occurred"
+    }
+    catch {
+        Write-Warning $_
+    }
+    finally {
+        Write-Verbose "Finally block executed"
+    }
+
+    $script:globalVar = 99 # Scope modifier
+
+    # Here-String
+    $hereString = @"
+This is a here-string.
+Name: $Name
+"@
+
+    return $hereString
+}
+
+# Command invocation, pipeline, splatting
+$paramSplat = @{
+    Name  = 'PowerShell'
+    Count = 2
+}
+Get-SampleData @paramSplat | Out-File -FilePath "./output.txt"
+
+# Type literal, member access, method call
+[System.DateTime]::Now.ToString("yyyy-MM-dd")
+
+# Subexpression
+Write-Host "2 + 2 = $($array[0] + $array[1])"
+
+# Command substitution
+$pwdPath = $(Get-Location).Path
+Write-Host "Current directory: $pwdPath"
diff --git a/assets/highlighting-tests/properties.conf b/assets/highlighting-tests/properties.conf
new file mode 100644
index 000000000000..3bd3b62693ba
--- /dev/null
+++ b/assets/highlighting-tests/properties.conf
@@ -0,0 +1,13 @@
+# General Settings
+[General]
+enabled = true
+debug = false
+log_level = info
+max_connections = 1000
+
+[SSL]
+enabled = true
+cert_file = /etc/ssl/certs/server.crt
+key_file = /etc/ssl/private/server.key
+protocols = TLSv1.2, TLSv1.3 # Supported protocols: "TLSv1.2" and "TLSv1.3"
+cipher_suite = "ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256"
diff --git a/assets/highlighting-tests/xml.xml b/assets/highlighting-tests/xml.xml
new file mode 100644
index 000000000000..ad2c2aa09a0d
--- /dev/null
+++ b/assets/highlighting-tests/xml.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<!DOCTYPE root [
+  <!ENTITY custom "Custom Entity">
+]>
+<!-- Root comment with special chars: <>&"' -->
+<?processing-instruction target="value"?>
+<root xmlns="http://default.namespace" xmlns:ns="http://custom.namespace">
+  <!-- Elements and attributes -->
+  <element attribute="value" ns:attr='single-quoted' empty-attr="">
+    <self-closing id="123" />
+    <nested>Text &amp; &lt; &gt; &quot; &apos; content</nested>
+    <entity>&custom;</entity>
+  </element>
+
+  <!-- CDATA section -->
+  <script><![CDATA[
+    function test() {
+      return x < 10 && y > 5;
+    }
+  ]]></script>
+
+  <!-- Mixed content -->
+  <mixed>Text before <tag>inline</tag> text after</mixed>
+
+  <!-- Numeric character references -->
+  <chars>&#65; &#x41; &#x1F600;</chars>
+
+  <!-- Edge cases -->
+  <edge-cases>
+    <empty></empty>
+    <whitespace>   </whitespace>
+    <special-chars>!@#$%^*()_+-={}[]|\:;"'&lt;&gt;,.</special-chars>
+  </edge-cases>
+
+  <!-- Nested namespaces -->
+  <ns:custom ns:id="namespaced">
+    <ns:child />
+  </ns:custom>
+</root>
diff --git a/assets/highlighting-tests/yaml.yml b/assets/highlighting-tests/yaml.yml
new file mode 100644
index 000000000000..6dbf0abada91
--- /dev/null
+++ b/assets/highlighting-tests/yaml.yml
@@ -0,0 +1,43 @@
+# This is a comment
+---
+string: "Hello, world!"
+plain: plainValue
+multiline: |
+  This is a
+  true
+  multiline string.
+folded: >
+  This is a
+  true
+  folded string.
+number_int: 42
+number_float: 3.1415
+number_scientific: 1.23e45
+number_negative: -7
+boolean_true: true
+boolean_false: false
+null_value: null
+explicit_null: ~
+date: 2024-06-01
+timestamp: 2024-06-01T12:34:56Z
+confusable_string_number: 1.23e45 1.23e45 # This is a comment
+sequence:
+  - item1
+  - item2
+  - 3
+  - true
+mapping:
+  key1: value1
+  key2: value2
+nested:
+  - name: Alice
+    age: 30
+    married: false
+  - name: Bob
+    age: 25
+    married: true
+flow_sequence: [foo, 123, bar]
+flow_mapping: { foo: bar }
+literal_colon: "value:with:colons"
+literal_dash: "-not-a-sequence"
+special_chars: "Tab:\t Newline:\n Unicode:\u2713"
diff --git a/crates/edit/benches/lib.rs b/crates/edit/benches/lib.rs
index 45c4d52793bf..372e8ebaebac 100644
--- a/crates/edit/benches/lib.rs
+++ b/crates/edit/benches/lib.rs
@@ -3,11 +3,12 @@
 
 use std::hint::black_box;
 use std::io::Cursor;
+use std::path::Path;
 use std::{mem, vec};
 
 use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
 use edit::helpers::*;
-use edit::{buffer, hash, json, oklab, simd, unicode};
+use edit::{buffer, hash, json, lsh, oklab, simd, unicode};
 use stdext::arena::{self, scratch_arena};
 use stdext::collections::BVec;
 use stdext::float::parse_f64_approx;
@@ -187,6 +188,34 @@ fn bench_json(c: &mut Criterion) {
     );
 }
 
+fn bench_lsh(c: &mut Criterion) {
+    let bytes = include_bytes!("../../../assets/highlighting-tests/markdown.md");
+    let bytes = &bytes[..]; // Re-borrow the embedded array as a plain byte slice.
+    let lang = lsh::LANGUAGES.iter().find(|lang| lang.id == "markdown").unwrap();
+    let highlighter = lsh::Highlighter::new(black_box(&bytes), lang);
+
+    c.benchmark_group("lsh").throughput(Throughput::Bytes(bytes.len() as u64)).bench_function(
+        "markdown",
+        |b| {
+            b.iter(|| {
+                let mut h = highlighter.clone(); // Every iteration starts from the same state.
+                loop {
+                    let scratch = scratch_arena(None);
+                    let res = h.parse_next_line(&scratch);
+                    if res.is_empty() {
+                        break; // An empty result ends the run (presumably end of input).
+                    }
+                }
+            })
+        },
+    );
+
+    c.benchmark_group("lsh").bench_function("process_file_associations", |b| {
+        let path = Path::new("/some/long/path/to/file/foo.bar.foo.bar.foo.bar");
+        b.iter(|| lsh::process_file_associations(lsh::FILE_ASSOCIATIONS, black_box(path)))
+    });
+}
+
 fn bench_oklab(c: &mut Criterion) {
     c.benchmark_group("oklab")
         .bench_function("StraightRgba::as_oklab", |b| {
@@ -294,6 +323,7 @@ fn bench(c: &mut Criterion) {
     bench_glob(c);
     bench_hash(c);
     bench_json(c);
+    bench_lsh(c);
     bench_oklab(c);
     bench_simd_lines_fwd(c);
     bench_simd_memchr2(c);
diff --git a/crates/edit/src/buffer/mod.rs b/crates/edit/src/buffer/mod.rs
index 1e2b46110013..4ec01f103a7f 100644
--- a/crates/edit/src/buffer/mod.rs
+++ b/crates/edit/src/buffer/mod.rs
@@ -42,7 +42,7 @@ use stdext::{ReplaceRange as _, arena_write_fmt, minmax, slice_as_uninit_mut, sl
 use crate::cell::SemiRefCell;
 use crate::clipboard::Clipboard;
 use crate::document::{ReadableDocument, WriteableDocument};
-use crate::framebuffer::{Framebuffer, IndexedColor};
+use crate::framebuffer::{Attributes, Framebuffer, IndexedColor};
 use crate::helpers::*;
 use crate::lsh::cache::HighlighterCache;
 use crate::lsh::{HighlightKind, Highlighter, Language};
@@ -2133,13 +2133,31 @@ impl TextBuffer {
                 let color = match curr.kind {
                     HighlightKind::Other => None,
                     HighlightKind::Comment => Some(IndexedColor::Green),
+                    HighlightKind::Method => Some(IndexedColor::BrightYellow),
+                    HighlightKind::String => Some(IndexedColor::BrightRed),
+                    HighlightKind::Variable => Some(IndexedColor::BrightCyan),
+                    HighlightKind::ConstantLanguage => Some(IndexedColor::BrightBlue),
                     HighlightKind::ConstantNumeric => Some(IndexedColor::BrightGreen),
                     HighlightKind::KeywordControl => Some(IndexedColor::BrightMagenta),
+                    HighlightKind::KeywordOther => Some(IndexedColor::BrightBlue),
+                    HighlightKind::MarkupBold => None,
                     HighlightKind::MarkupChanged => Some(IndexedColor::BrightBlue),
                     HighlightKind::MarkupDeleted => Some(IndexedColor::BrightRed),
+                    HighlightKind::MarkupHeading => Some(IndexedColor::BrightBlue),
                     HighlightKind::MarkupInserted => Some(IndexedColor::BrightGreen),
+                    HighlightKind::MarkupItalic => None,
+                    HighlightKind::MarkupLink => None,
+                    HighlightKind::MarkupList => Some(IndexedColor::BrightBlue),
+                    HighlightKind::MarkupStrikethrough => None,
                     HighlightKind::MetaHeader => Some(IndexedColor::BrightBlue),
                 };
+                let attr = match curr.kind {
+                    HighlightKind::MarkupBold => Some(Attributes::Bold),
+                    HighlightKind::MarkupItalic => Some(Attributes::Italic),
+                    HighlightKind::MarkupLink => Some(Attributes::Underlined),
+                    HighlightKind::MarkupStrikethrough => Some(Attributes::Strikethrough),
+                    _ => None,
+                };
 
                 // Handle the case where the highlight spans multiple visual lines
                 // due to word wrapping. The range is [beg, end) in terms of offsets,
@@ -2201,6 +2219,9 @@ impl TextBuffer {
                     if let Some(color) = color {
                         fb.blend_fg(rect, fb.indexed(color));
                     }
+                    if let Some(attr) = attr {
+                        fb.replace_attr(rect, Attributes::All, attr);
+                    }
                 }
             }
         }
diff --git a/crates/lsh/definitions/json.lsh b/crates/lsh/definitions/json.lsh
new file mode 100644
index 000000000000..6b8f296b441b
--- /dev/null
+++ b/crates/lsh/definitions/json.lsh
@@ -0,0 +1,39 @@
+#[display_name = "JSON"]
+#[path = "**/*.json"]
+#[path = "**/*.jsonc"]
+pub fn json() {
+    until /$/ {
+        yield other;
+
+        if /\/\/.*/ {
+            yield comment;
+        } else if /\/\*/ {
+            loop {
+                yield comment;
+                await input;
+                if /\*\// {
+                    yield comment;
+                    break;
+                }
+            }
+        } else if /"/ {
+            double_quote_string();
+        } else if /true|false|null/ {
+            if /\w+/ {
+                // Not a keyword after all: more word characters follow (e.g. "nullable").
+            } else {
+                yield constant.language;
+            }
+        } else if /-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?/ {
+            if /\w+/ {
+                // Not a number after all: word characters follow directly (e.g. "123abc").
+            } else {
+                yield constant.numeric;
+            }
+        } else if /\w+/ {
+            // Invalid token? Skip the whole word (presumably so its tail can't match a rule above).
+        }
+
+        yield other;
+    }
+}
diff --git a/crates/lsh/definitions/lsh.lsh b/crates/lsh/definitions/lsh.lsh
new file mode 100644
index 000000000000..e9cf958916c8
--- /dev/null
+++ b/crates/lsh/definitions/lsh.lsh
@@ -0,0 +1,50 @@
+#[display_name = "LSH"]
+#[path = "**/*.lsh"]
+pub fn lsh() {
+    until /$/ {
+        yield other;
+
+        if /\/\/.*/ {
+            yield comment;
+        } else if /pub|fn|await/ {
+            if /\w+/ {
+                yield other;
+            } else {
+                yield keyword.other;
+            }
+        } else if /if|else|until|loop|break|continue|return/ {
+            if /\w+/ {
+                yield other;
+            } else {
+                yield keyword.control;
+            }
+        } else if /yield/ {
+            if /\w+/ {
+                yield other;
+            } else {
+                yield keyword.other;
+                if /\s+/ {
+                    // Gobble the whitespace between `yield` and its argument.
+                    yield other;
+                }
+                if /[\w.]+/ {
+                    // The yield argument, i.e. the highlight kind such as `keyword.control`.
+                    yield markup.link;
+                }
+            }
+        } else if /\// {
+            until /$/ {
+                if /\\./ {
+                    // Skip the escaped character so that `\/` doesn't end the regex literal.
+                } else if /\// {
+                    yield string;
+                    break;
+                }
+            }
+        } else if /"/ {
+            double_quote_string();
+        }
+
+        yield other;
+    }
+}
diff --git a/crates/lsh/definitions/markdown.lsh b/crates/lsh/definitions/markdown.lsh
new file mode 100644
index 000000000000..f5444d32f94d
--- /dev/null
+++ b/crates/lsh/definitions/markdown.lsh
@@ -0,0 +1,204 @@
+#[display_name = "Markdown"]
+#[path = "**/*.md"]
+pub fn markdown() {
+    // Gobble any leading whitespace on the line.
+    if /\s+/ {
+        yield other;
+    }
+
+    if /#+\s+.*/ {
+        yield markup.heading;
+    } else if />.*/ {
+        yield comment;
+    } else if /```/ {
+        // NOTE: These checks are sorted alphabetically.
+        if /(?i:diff)/ {
+            loop {
+                await input;
+                if /\s*```/ {
+                    return;
+                } else {
+                    diff();
+                    // diff() may not have eaten that line.
+                    // It doesn't have to. That's our responsibility.
+                    if /.*/ {}
+                }
+            }
+        } else if /(?i:json)/ {
+            loop {
+                await input;
+                if /\s*```/ {
+                    return;
+                } else {
+                    json();
+                    if /.*/ {}
+                }
+            }
+        } else if /(?i:pwsh|powershell)/ {
+            loop {
+                await input;
+                if /\s*```/ {
+                    return;
+                } else {
+                    powershell();
+                    if /.*/ {}
+                }
+            }
+        } else if /(?i:yaml)/ {
+            loop {
+                await input;
+                if /\s*```/ {
+                    return;
+                } else {
+                    yaml();
+                    if /.*/ {}
+                }
+            }
+        } else {
+            loop {
+                await input;
+                if /\s*```.*/ {
+                    return;
+                } else if /.*/ {
+                    // Gobble the rest of the line with no highlighting.
+                }
+            }
+        }
+    } else {
+        if /[\*-]\s+/ {
+            yield markup.list;
+        }
+
+        // The structure of inline emphasis, etc., matchers is
+        // awfully wrong, but parsing Markdown is also awful in general.
+        // In fact I think it's categorically impossible to implement like this,
+        // because it requires lookahead across potentially infinite lines:
+        // The Markdown spec describes its "parsing strategy" as "two phase"
+        // where the first phase segments the document into block structures.
+        // For instance, a `comment` may span multiple lines, _unless_
+        // there's a paragraph break in between, e.g. due to a ">" comment.
+        until /$/ {
+            yield other;
+
+            if /\[/ {
+                // Anything inside [...] is link text and is colored like a string.
+                yield other;
+                if /[^\]]*/ {
+                    yield string;
+
+                    // Now look for the link target in parentheses.
+                    if /\]\(/ {
+                        yield other;
+                        if /[^\s\)]+/ {
+                            yield markup.link;
+                        }
+
+                        if /\s+/ {
+                        }
+
+                        // The link may have a title in quotes.
+                        yield other;
+                        if /"/ {
+                            if /[^"]+/ {
+                            }
+                            if /"/ {
+                                yield string;
+                            }
+                        }
+                    }
+
+                }
+            } else if /__/ {
+                if /_+/ {
+                    // "_____" = no hit
+                } else if /\s+/ {
+                    // "__ foo __" = no hit
+                } else {
+                    until /$/ {
+                        if /\\./ {
+                            // gobble escape char
+                        } else if /[^\s_]+__/ {
+                            yield markup.bold;
+                            break;
+                        } else if /__/ {
+                            break;
+                        }
+                    }
+                }
+            } else if /\*\*/ {
+                if /\s+/ {
+                    // "** foo **" = no hit
+                } else {
+                    until /$/ {
+                        if /\\./ {
+                            // gobble escape char
+                        } else if /[^\s*]+\*\*/ {
+                            yield markup.bold;
+                            break;
+                        } else if /\*\*/ {
+                            break;
+                        }
+                    }
+                }
+            } else if /_/ {
+                if /\s+/ {
+                    // "_ foo _" = no hit
+                } else {
+                    until /$/ {
+                        if /\\./ {
+                            // gobble escape char
+                        } else if /[^\s_]+_/ {
+                            yield markup.italic;
+                            break;
+                        } else if /_/ {
+                            break;
+                        }
+                    }
+                }
+            } else if /\*/ {
+                if /\s+/ {
+                    // "* foo *" = no hit
+                } else {
+                    until /$/ {
+                        if /\\./ {
+                            // gobble escape char
+                        } else if /[^\s*]+\*/ {
+                            yield markup.italic;
+                            break;
+                        } else if /\*/ {
+                            break;
+                        }
+                    }
+                }
+            } else if /~~/ {
+                if /\s+/ {
+                    // "~~ foo ~~" = no hit
+                } else {
+                    until /$/ {
+                        if /\\./ {
+                            // gobble escape char
+                        } else if /[^\s~]+~~/ {
+                            yield markup.strikethrough;
+                            break;
+                        } else if /~~/ {
+                            break;
+                        }
+                    }
+                }
+            } else if /```/ {
+                until /$/ { if /```/ { break; } }
+                yield string;
+            } else if /``/ {
+                until /$/ { if /``/ { break; } }
+                yield string;
+            } else if /`/ {
+                until /$/ { if /`/ { break; } }
+                yield string;
+            } else if /\\./ {
+                // Gobble escape char
+            }
+
+            yield other;
+        }
+    }
+}
diff --git a/crates/lsh/definitions/powershell.lsh b/crates/lsh/definitions/powershell.lsh
new file mode 100644
index 000000000000..ce4bf5e45747
--- /dev/null
+++ b/crates/lsh/definitions/powershell.lsh
@@ -0,0 +1,64 @@
+#[display_name = "PowerShell"]
+#[path = "**/*.ps1"]
+#[path = "**/*.psd1"]
+#[path = "**/*.psm1"]
+pub fn powershell() {
+    until /$/ {
+        yield other;
+
+        if /#.*/ {
+            yield comment;
+        } else if /<#/ {
+            loop {
+                yield comment;
+                if /#>/ { yield comment; break; }
+                await input;
+            }
+        } else if /'/ {
+            loop {
+                yield string;
+                if /''/ {}
+                else if /'/ { yield string; break; }
+                await input;
+            }
+        } else if /@"/ {
+            loop {
+                yield string;
+                if /`./ {}
+                else if /"@/ { yield string; break; }
+                await input;
+            }
+        } else if /"/ {
+            loop {
+                yield string;
+                if /`./ {}
+                else if /"/ { yield string; break; }
+                await input;
+            }
+        } else if /function|param/ {
+            yield keyword.other;
+        } else if /elseif|else|if|for|switch|default|throw|try|catch|finally/ {
+            yield keyword.control;
+        } else if /-?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?/ {
+            if /[\w\-]+/ {
+                yield method;
+            } else {
+                yield constant.numeric;
+            }
+        } else if /\$false|\$true|\$null/ {
+            if /\w+/ {
+                yield variable;
+            } else {
+                yield constant.language;
+            }
+        } else if /[$@][\w\d?:]+/ {
+            yield variable;
+        } else if /[\w\d?:]+-[\w\d?:-]*/ {
+            yield method;
+        } else if /[\w\d?:]+/ {
+            // Gobble any other tokens that should not be highlighted
+        }
+
+        yield other;
+    }
+}
diff --git a/crates/lsh/definitions/properties.lsh b/crates/lsh/definitions/properties.lsh
new file mode 100644
index 000000000000..fe109b406dd1
--- /dev/null
+++ b/crates/lsh/definitions/properties.lsh
@@ -0,0 +1,55 @@
+#[display_name = "Properties"]
+// INI files
+#[path = "**/*.ini"]
+// Git configuration files
+#[path = "**/.config/git/config"]
+#[path = "**/.git/config"]
+#[path = "**/gitconfig"]
+#[path = "**/*.gitattributes"]
+#[path = "**/*.gitconfig"]
+#[path = "**/*.gitmodules"]
+// Extensions that VS Code treats as "properties" files
+#[path = "**/*.cfg"]
+#[path = "**/*.conf"]
+#[path = "**/*.directory"]
+#[path = "**/*.editorconfig"]
+#[path = "**/*.properties"]
+#[path = "**/*.repo"]
+// systemd units and related configuration files
+#[path = "**/*.automount"]
+#[path = "**/*.dnssd"]
+#[path = "**/*.link"]
+#[path = "**/*.mount"]
+#[path = "**/*.netdev"]
+#[path = "**/*.network"]
+#[path = "**/*.nspawn"]
+#[path = "**/*.path"]
+#[path = "**/*.service"]
+#[path = "**/*.slice"]
+#[path = "**/*.socket"]
+#[path = "**/*.swap"]
+#[path = "**/*.systemd"]
+#[path = "**/*.target"]
+#[path = "**/*.timer"]
+pub fn properties() {
+    // Section header like "[core]": `[^\]]+` stops at the closing bracket so that `\]` can match it.
+    if /\[[^\]]+\]/ {
+        yield meta.header;
+    }
+    until /$/ {
+        yield other;
+
+        if /[;#].*/ {
+            yield comment;
+        } else if /([\w.-]+)\s*=\s*[\w.-]*/ {
+            yield $1 as variable;
+            yield other;
+        } else if /'/ {
+            single_quote_string();
+        } else if /"/ {
+            double_quote_string();
+        }
+
+        yield other;
+    }
+}
diff --git a/crates/lsh/definitions/utility.lsh b/crates/lsh/definitions/utility.lsh
new file mode 100644
index 000000000000..d8f6be7ed76d
--- /dev/null
+++ b/crates/lsh/definitions/utility.lsh
@@ -0,0 +1,21 @@
+fn single_quote_string() {
+    until /$/ {
+        if /\\./ {
+            // Escape sequence: consume the backslash and the escaped character so that \' does not end the string.
+        } else if /'/ {
+            yield string;
+            break;
+        }
+    }
+}
+
+fn double_quote_string() {
+    until /$/ {
+        if /\\./ {
+            // Escape sequence: consume the backslash and the escaped character so that \" does not end the string.
+        } else if /"/ {
+            yield string;
+            break;
+        }
+    }
+}
diff --git a/crates/lsh/definitions/xml.lsh b/crates/lsh/definitions/xml.lsh
new file mode 100644
index 000000000000..988273c91654
--- /dev/null
+++ b/crates/lsh/definitions/xml.lsh
@@ -0,0 +1,78 @@
+#[display_name = "XML"]
+// XML-based formats. Note that HTML files are highlighted with these rules as well.
+#[path = "**/*.csproj.user"]
+#[path = "**/*.csproj"]
+#[path = "**/*.html"]
+#[path = "**/*.nuspec"]
+#[path = "**/*.proj"]
+#[path = "**/*.props"]
+#[path = "**/*.rss"]
+#[path = "**/*.slnx"]
+#[path = "**/*.svg"]
+#[path = "**/*.targets"]
+#[path = "**/*.vcxproj.filters"]
+#[path = "**/*.vcxproj"]
+#[path = "**/*.xaml"]
+#[path = "**/*.xml"]
+pub fn xml() {
+    until /$/ {
+        yield other;
+
+        if /<!--/ {
+            loop {
+                yield comment;
+                await input;
+                if /-->/ {
+                    yield comment;
+                    break;
+                }
+            }
+        } else if /<!\[CDATA\[/ {
+            loop {
+                yield string;
+                await input;
+                if /\]\]>/ {
+                    yield string;
+                    break;
+                }
+            }
+        } else if /<!(DOCTYPE)[^>]*>/ {
+            yield $1 as constant.language;
+            yield other;
+        } else if /(?:<\?|<)([\w:.-]+)/ {
+            yield $1 as constant.language;
+            yield other;
+
+            until />/ {
+                yield other;
+                await input;
+
+                if /([\w:.-]+)\s*=/ {
+                    yield $1 as variable;
+                    yield other;
+                } else if /"/ {
+                    until /"/ {
+                        yield string;
+                        await input;
+                    }
+                    yield string;
+                } else if /'/ {
+                    until /'/ {
+                        yield string;
+                        await input;
+                    }
+                    yield string;
+                }
+            }
+
+            yield other;
+        } else if /<\/([\w:.-]+)\s*>/ {
+            yield $1 as constant.language;
+            yield other;
+        } else if /(?:&#|&)[\w:.-]+;/ {
+            yield constant.numeric;
+        }
+
+        yield other;
+    }
+}
diff --git a/crates/lsh/definitions/yaml.lsh b/crates/lsh/definitions/yaml.lsh
new file mode 100644
index 000000000000..a090447e3dd5
--- /dev/null
+++ b/crates/lsh/definitions/yaml.lsh
@@ -0,0 +1,78 @@
+#[display_name = "YAML"]
+#[path = "**/*.yaml"]
+#[path = "**/*.yml"]
+pub fn yaml() {
+    if /\s+/ {
+        // Leading whitespace, i.e. the indentation of this line
+    }
+    // NOTE(review): `off` looks like the current offset in the line, making this the indentation width.
+    var indentation = off;
+
+    loop {
+        if /-\s+/ {
+            // List item marker ("- ")
+        }
+        if /\w+:(?:\s+|$)/ {
+            // Mapping key ("key:"); the value, if any, follows
+        }
+
+        if /[|>][-+]?.*/ {
+            // A block scalar indicator (`|` or `>`, optionally followed by `-`/`+`): capture content until indentation is back to before.
+            // Since we'll skip multiple lines, we need to use nested loops here.
+
+            loop {
+                // Wait for the next line in this multiline string.
+                await input;
+
+                // Skip the leading whitespace of that line
+                if /\s+/ {
+                }
+
+                if off <= indentation {
+                    // Once the new indentation is no deeper than the starting one, the multiline string has ended.
+                    break;
+                }
+
+                if /.*/ {
+                    yield string;
+                }
+            }
+        } else {
+            break;
+        }
+    }
+
+    yield other;
+
+    if /---/ {
+        yield other;
+    } else if /true|false|yes|no|null|~/ {
+        // If it looks like a keyword, check if it's actually a string (e.g. "foo: true false").
+        // TODO: Ideally this would not consume the last chunk of /\s+/ before a /# comment/.
+        if /\s*[^\s#]+[^#]*/ {
+            yield string;
+        } else {
+            yield constant.language;
+        }
+    } else if /-?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?/ {
+        // Same as with keywords above: trailing text turns the number into a plain string.
+        if /\s*[^\s#]+[^#]*/ {
+            yield string;
+        } else {
+            yield constant.numeric;
+        }
+    } else if /[^\s#]+[^#]*/ {
+        // Same as before: Gobble anything that looks like a string.
+        // TODO: This could be simplified & combined with the above if we stored the
+        // highlight kind into a variable and then yield the variable value itself.
+        yield string;
+    }
+
+    if /\s+/ {
+        yield other;
+    }
+
+    if /#.*/ {
+        yield comment;
+    }
+}

From 6d52672ddc44d65e8532f87ce875eeeb275f93e3 Mon Sep 17 00:00:00 2001
From: Leonard Hecker <leonard@hecker.io>
Date: Thu, 26 Mar 2026 15:25:37 +0100
Subject: [PATCH 02/11] Fix Charset::set_range

---
 crates/lsh/src/compiler/charset.rs | 32 +++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/crates/lsh/src/compiler/charset.rs b/crates/lsh/src/compiler/charset.rs
index 32d1e99653ee..ca772904b8b9 100644
--- a/crates/lsh/src/compiler/charset.rs
+++ b/crates/lsh/src/compiler/charset.rs
@@ -62,6 +62,11 @@ impl Charset {
         let end_word = end / WORD_BITS;
         let end_bit = end % WORD_BITS;
 
+        // Save the bits above end_bit in the last word before we overwrite it
+        let shift = end_bit + 1;
+        let tail_mask = if shift < WORD_BITS { usize::MAX << shift } else { 0 };
+        let saved_tail = self.bits[end_word] & tail_mask;
+
         // Write the starting bits of the first word
         let mask = usize::MAX << start_bit;
         self.bits[start_word] =
@@ -74,11 +79,8 @@ impl Charset {
             word += 1;
         }
 
-        // Fix the trailing bits of the last word we wrote above
-        let shift = end_bit + 1;
-        let mask = if shift < WORD_BITS { usize::MAX << shift } else { 0 };
-        self.bits[end_word] =
-            if value { self.bits[end_word] & !mask } else { self.bits[end_word] | mask };
+        // Restore the trailing bits of the last word
+        self.bits[end_word] = (self.bits[end_word] & !tail_mask) | saved_tail;
     }
 
     pub fn merge(&mut self, other: &Charset) {
@@ -198,4 +200,24 @@ mod tests {
             }
         }
     }
+
+    #[test]
+    fn set_range_overlapping() {
+        // Setting a single bit must leave a previously set bit of the same word intact.
+        let mut singles = Charset::no();
+        singles.set_range(b'e'..=b'e', true);
+        singles.set_range(b'E'..=b'E', true);
+        for byte in u8::MIN..=u8::MAX {
+            assert_eq!(singles.get(byte), matches!(byte, b'e' | b'E'), "bit {byte}");
+        }
+
+        // Filling a wide range must keep the bits that were set outside of it beforehand.
+        let mut wide = Charset::no();
+        wide.set_range(0..=0, true);
+        wide.set_range(255..=255, true);
+        wide.set_range(10..=245, true);
+        for byte in u8::MIN..=u8::MAX {
+            assert_eq!(wide.get(byte), matches!(byte, 0 | 10..=245 | 255), "bit {byte}");
+        }
+    }
 }

From 7474c099bd1222b916c52237f76cc876e94426f1 Mon Sep 17 00:00:00 2001
From: Leonard Hecker <leonard@hecker.io>
Date: Thu, 26 Mar 2026 15:25:52 +0100
Subject: [PATCH 03/11] Clean up atom/literal parsing, Add \t support

---
 crates/lsh/src/compiler/regex.rs | 281 +++++++++++++++----------------
 1 file changed, 132 insertions(+), 149 deletions(-)

diff --git a/crates/lsh/src/compiler/regex.rs b/crates/lsh/src/compiler/regex.rs
index 1862ce929706..e4d3c4696253 100644
--- a/crates/lsh/src/compiler/regex.rs
+++ b/crates/lsh/src/compiler/regex.rs
@@ -109,6 +109,14 @@ enum Regex {
     Dot,
 }
 
+enum Atom {
+    Empty, // End of the pattern: there was nothing left to read.
+    Meta(char), // Unescaped metacharacter, one of `( ) [ ] { } | ? * + . ^ $`.
+    Char(char), // Literal character: plain, escaped non-alphanumeric, or `\t` (tab).
+    WordEnd, // The `\>` escape.
+    Class(Charset), // Predefined class escape: `\w`, `\W`, `\d`, `\D`, `\s` or `\S`.
+}
+
 struct RegexParser<'a> {
     input: &'a str,
     pos: usize,
@@ -132,8 +140,12 @@ impl<'a> RegexParser<'a> {
         Ok(result)
     }
 
+    fn rest(&self) -> &'a str {
+        &self.input[self.pos..] // Everything from the current parse position to the end of the pattern.
+    }
+
     fn peek(&self) -> Option<char> {
-        self.input[self.pos..].chars().next()
+        self.rest().chars().next()
     }
 
     fn advance(&mut self) -> Option<char> {
@@ -186,7 +198,7 @@ impl<'a> RegexParser<'a> {
 
     /// a?, a*, a+, a{n,m}
     fn parse_quantified(&mut self) -> Result<Regex, String> {
-        let base = self.parse_atom()?;
+        let base = self.parse_primary()?;
 
         let (min, max) = match self.peek() {
             Some('?') => {
@@ -237,7 +249,7 @@ impl<'a> RegexParser<'a> {
     }
 
     /// Parse a single atom (literal, class, group, anchor)
-    fn parse_atom(&mut self) -> Result<Regex, String> {
+    fn parse_primary(&mut self) -> Result<Regex, String> {
         match self.peek() {
             None => Ok(Regex::Empty),
             Some('(') => self.parse_group(),
@@ -260,67 +272,38 @@ impl<'a> RegexParser<'a> {
     /// metacharacters into a single literal. For example, `\+\+\+` becomes `Literal("+++")`.
     fn parse_literal(&mut self) -> Result<Regex, String> {
         let mut lit = String::new();
+        let mut prev_atom_lit_len = 0;
+        let mut prev_atom_pos = self.pos;
 
         loop {
-            match self.peek() {
-                Some('\\') => {
-                    let escape_char = self.input[self.pos + 1..].chars().next();
-                    match escape_char {
-                        // Character classes
-                        Some('w' | 'W' | 'd' | 'D' | 's' | 'S') => {
-                            if lit.is_empty() {
-                                // Start with the class
-                                self.advance(); // consume '\'
-                                return self.parse_escape_as_regex();
-                            } else {
-                                // Return accumulated literal, leave escape for next parse
-                                break;
-                            }
-                        }
-                        // Word boundary
-                        Some('>') => {
-                            if lit.is_empty() {
-                                self.advance(); // consume '\'
-                                self.advance(); // consume '>'
-                                return Ok(Regex::WordEnd);
-                            } else {
-                                break;
-                            }
-                        }
-                        // Simple escape
-                        Some(c) if !c.is_ascii_alphanumeric() => {
-                            // Check if this char would be quantified
-                            let after_escape = self.pos + 1 + c.len_utf8();
-                            if after_escape < self.input.len() && !lit.is_empty() {
-                                let next = self.input[after_escape..].chars().next();
-                                if matches!(next, Some('?' | '*' | '+' | '{')) {
-                                    break;
-                                }
-                            }
-                            self.advance(); // consume '\'
-                            self.advance(); // consume escaped char
-                            lit.push(c);
-                        }
-                        Some(c) => {
-                            return Err(format!("unknown escape sequence '\\{}'", c));
-                        }
-                        None => {
-                            return Err("unexpected end of pattern after backslash".to_string());
-                        }
+            let start = self.pos;
+            match self.parse_atom()? {
+                Atom::Meta('?' | '*' | '+' | '{') => {
+                    // Quantifiers apply to the preceding atom, so we need to pop
+                    // the last atom (= char / escape char) and stop parsing.
+                    if prev_atom_lit_len == 0 {
+                        self.pos = start;
+                    } else {
+                        lit.truncate(prev_atom_lit_len);
+                        self.pos = prev_atom_pos;
                     }
+                    break;
                 }
-                Some(c) if !is_meta_char(c) => {
-                    let next_pos = self.pos + c.len_utf8();
-                    if next_pos < self.input.len() && !lit.is_empty() {
-                        let next = self.input[next_pos..].chars().next();
-                        if matches!(next, Some('?' | '*' | '+' | '{')) {
-                            break;
-                        }
-                    }
+                Atom::Char(c) => {
+                    prev_atom_lit_len = lit.len();
+                    prev_atom_pos = start;
                     lit.push(c);
-                    self.advance();
                 }
-                _ => break,
+                Atom::WordEnd if lit.is_empty() => {
+                    return Ok(Regex::WordEnd);
+                }
+                Atom::Class(cs) if lit.is_empty() => {
+                    return Ok(Regex::CharClass(cs));
+                }
+                _ => {
+                    self.pos = start;
+                    break;
+                }
             }
         }
 
@@ -336,32 +319,6 @@ impl<'a> RegexParser<'a> {
         Ok(Regex::Literal(lit, self.case_insensitive))
     }
 
-    /// \w, \d, etc.
-    fn parse_escape_as_regex(&mut self) -> Result<Regex, String> {
-        match self.advance() {
-            Some('w') => Ok(Regex::CharClass(ASCII_WORD_CHARSET)),
-            Some('W') => {
-                let mut cs = ASCII_WORD_CHARSET;
-                cs.invert();
-                Ok(Regex::CharClass(cs))
-            }
-            Some('d') => Ok(Regex::CharClass(ASCII_DIGIT_CHARSET)),
-            Some('D') => {
-                let mut cs = ASCII_DIGIT_CHARSET;
-                cs.invert();
-                Ok(Regex::CharClass(cs))
-            }
-            Some('s') => Ok(Regex::CharClass(ASCII_WHITESPACE_CHARSET)),
-            Some('S') => {
-                let mut cs = ASCII_WHITESPACE_CHARSET;
-                cs.invert();
-                Ok(Regex::CharClass(cs))
-            }
-            Some(c) => Err(format!("unknown escape sequence '\\{}'", c)),
-            None => Err("unexpected end of pattern after backslash".to_string()),
-        }
-    }
-
     /// (foo), (?:foo), (?i:foo)
     fn parse_group(&mut self) -> Result<Regex, String> {
         self.expect('(')?;
@@ -400,6 +357,19 @@ impl<'a> RegexParser<'a> {
 
     /// [a-z], [^a-z], etc.
     fn parse_char_class(&mut self) -> Result<Regex, String> {
+        fn unexpected_end() -> Result<Regex, String> {
+            Err("unexpected end of pattern in character class".to_string())
+        }
+        fn unexpected_unicode(c: char) -> Result<Regex, String> {
+            Err(format!("non-ASCII character '{c:?}' not supported in character class"))
+        }
+        fn unexpected_class() -> Result<Regex, String> {
+            Err("cannot use character class in character class range".to_string())
+        }
+        fn invalid_range(start: u8, end: u8) -> Result<Regex, String> {
+            Err(format!("invalid character range {:?}-{:?}", start as char, end as char))
+        }
+
         self.expect('[')?;
 
         let negated = if self.peek() == Some('^') {
@@ -419,47 +389,52 @@ impl<'a> RegexParser<'a> {
             self.advance();
         }
 
-        while let Some(c) = self.peek() {
-            if c == ']' {
-                self.advance();
-                break;
-            }
-
-            if c == '\\' {
-                self.advance();
-                let escaped = self.parse_escape_char()?;
-                match escaped {
-                    EscapedChar::Char(b) => charset.set(b, true),
-                    EscapedChar::Class(cs) => charset.merge(&cs),
+        loop {
+            match self.parse_atom()? {
+                Atom::Empty => return unexpected_end(),
+                Atom::Class(cs) => {
+                    charset.merge(&cs);
                 }
-            } else {
-                let start = c as u8;
-                self.advance();
+                Atom::WordEnd => {
+                    charset.set(b'>', true);
+                }
+                Atom::Meta(']') => break,
+                Atom::Meta(c) | Atom::Char(c) => {
+                    if !c.is_ascii() {
+                        return unexpected_unicode(c);
+                    }
 
-                // Check for range
-                if self.peek() == Some('-') && !self.input[self.pos + 1..].starts_with(']') {
-                    self.advance(); // consume -
-                    let end = match self.peek() {
-                        Some('\\') => {
-                            self.advance();
-                            match self.parse_escape_char()? {
-                                EscapedChar::Char(b) => b,
-                                EscapedChar::Class(_) => {
-                                    return Err("cannot use character class in range".to_string());
+                    let start = c as u8;
+                    let mut end = start;
+
+                    // Check for ranges, e.g. [a-z].
+                    // We exclude patterns like [a-], because this implicitly sets 'a'..='a' in this iteration,
+                    // and then '-'..='-' in the next iteration, which is the exact behavior we need.
+                    if let rest = self.rest()
+                        && rest.starts_with("-")
+                        && !rest.starts_with("-]")
+                    {
+                        self.advance(); // consume -
+
+                        match self.parse_atom()? {
+                            Atom::Empty => return unexpected_end(),
+                            Atom::Class(_) => return unexpected_class(),
+                            Atom::WordEnd => {
+                                end = b'>';
+                            }
+                            Atom::Meta(c) | Atom::Char(c) => {
+                                if !c.is_ascii() {
+                                    return unexpected_unicode(c);
                                 }
+                                end = c as u8;
                             }
                         }
-                        Some(c) => {
-                            self.advance();
-                            c as u8
-                        }
-                        None => {
-                            return Err("unexpected end of pattern in character class".to_string());
-                        }
-                    };
+                    }
+
+                    if start > end {
+                        return invalid_range(start, end);
+                    }
                     charset.set_range(start..=end, true);
-                } else {
-                    charset.set(start, true);
                 }
             }
         }
@@ -471,42 +446,50 @@ impl<'a> RegexParser<'a> {
         Ok(Regex::CharClass(charset))
     }
 
-    fn parse_escape_char(&mut self) -> Result<EscapedChar, String> {
-        match self.advance() {
-            Some('w') => Ok(EscapedChar::Class(ASCII_WORD_CHARSET)),
-            Some('W') => {
-                let mut cs = ASCII_WORD_CHARSET;
-                cs.invert();
-                Ok(EscapedChar::Class(cs))
-            }
-            Some('d') => Ok(EscapedChar::Class(ASCII_DIGIT_CHARSET)),
-            Some('D') => {
-                let mut cs = ASCII_DIGIT_CHARSET;
-                cs.invert();
-                Ok(EscapedChar::Class(cs))
+    fn parse_atom(&mut self) -> Result<Atom, String> {
+        let Some(c) = self.advance() else {
+            return Ok(Atom::Empty);
+        };
+
+        match c {
+            '(' | ')' | '[' | ']' | '{' | '}' | '|' | '?' | '*' | '+' | '.' | '^' | '$' => {
+                Ok(Atom::Meta(c))
             }
-            Some('s') => Ok(EscapedChar::Class(ASCII_WHITESPACE_CHARSET)),
-            Some('S') => {
-                let mut cs = ASCII_WHITESPACE_CHARSET;
-                cs.invert();
-                Ok(EscapedChar::Class(cs))
+            '\\' => {
+                let Some(ce) = self.advance() else {
+                    return Err("unexpected end of pattern after backslash".to_string());
+                };
+
+                match ce {
+                    '>' => Ok(Atom::WordEnd),
+                    'w' => Ok(Atom::Class(ASCII_WORD_CHARSET)),
+                    'W' => {
+                        let mut cs = ASCII_WORD_CHARSET;
+                        cs.invert();
+                        Ok(Atom::Class(cs))
+                    }
+                    'd' => Ok(Atom::Class(ASCII_DIGIT_CHARSET)),
+                    'D' => {
+                        let mut cs = ASCII_DIGIT_CHARSET;
+                        cs.invert();
+                        Ok(Atom::Class(cs))
+                    }
+                    's' => Ok(Atom::Class(ASCII_WHITESPACE_CHARSET)),
+                    'S' => {
+                        let mut cs = ASCII_WHITESPACE_CHARSET;
+                        cs.invert();
+                        Ok(Atom::Class(cs))
+                    }
+                    't' => Ok(Atom::Char('\t')),
+                    c if !c.is_ascii_alphanumeric() => Ok(Atom::Char(c)),
+                    c => Err(format!("unknown escape sequence '\\{c}'")),
+                }
             }
-            Some(c) if !c.is_ascii_alphanumeric() => Ok(EscapedChar::Char(c as u8)),
-            Some(c) => Err(format!("unknown escape sequence '\\{}'", c)),
-            None => Err("unexpected end of pattern after backslash".to_string()),
+            _ => Ok(Atom::Char(c)),
         }
     }
 }
 
-enum EscapedChar {
-    Char(u8),
-    Class(Charset),
-}
-
-fn is_meta_char(c: char) -> bool {
-    matches!(c, '(' | ')' | '[' | ']' | '{' | '}' | '|' | '?' | '*' | '+' | '.' | '^' | '$')
-}
-
 struct CodeGen<'a, 'c> {
     compiler: &'c mut Compiler<'a>,
     captures: CaptureList<'a>,

From 2b8ec8c93056465e08930b29037258ab9b6b8364 Mon Sep 17 00:00:00 2001
From: Leonard Hecker <leonard@hecker.io>
Date: Thu, 26 Mar 2026 15:26:41 +0100
Subject: [PATCH 04/11] Detect indented code blocks in Markdown

---
 crates/lsh/definitions/markdown.lsh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/crates/lsh/definitions/markdown.lsh b/crates/lsh/definitions/markdown.lsh
index f5444d32f94d..3fcc7721a157 100644
--- a/crates/lsh/definitions/markdown.lsh
+++ b/crates/lsh/definitions/markdown.lsh
@@ -1,6 +1,12 @@
 #[display_name = "Markdown"]
 #[path = "**/*.md"]
 pub fn markdown() {
+    // Any lines that start with a tab or 4+ spaces are code blocks.
+    if /(?:\t|    ).*/ {
+        yield other;
+        return;
+    }
+
     // Gobble any leading whitespace on the line.
     if /\s+/ {
         yield other;

From 76b58eb944521bdffda9de9c26df70ced781e44f Mon Sep 17 00:00:00 2001
From: Leonard Hecker <leonard@hecker.io>
Date: Fri, 27 Mar 2026 16:22:14 +0100
Subject: [PATCH 05/11] Remove unneeded rust-toolchain.toml file

---
 rust-toolchain.toml | 2 --
 1 file changed, 2 deletions(-)
 delete mode 100644 rust-toolchain.toml

diff --git a/rust-toolchain.toml b/rust-toolchain.toml
deleted file mode 100644
index 5d56faf9ae08..000000000000
--- a/rust-toolchain.toml
+++ /dev/null
@@ -1,2 +0,0 @@
-[toolchain]
-channel = "nightly"

From c51b006e49ff2c13777d57fbe8dfb4267e8d30b0 Mon Sep 17 00:00:00 2001
From: Leonard Hecker <leonard@hecker.io>
Date: Fri, 27 Mar 2026 16:22:39 +0100
Subject: [PATCH 06/11] Fix fast skip loop calculation

---
 crates/lsh/src/compiler/mod.rs | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/crates/lsh/src/compiler/mod.rs b/crates/lsh/src/compiler/mod.rs
index bbeb2814365b..d6b372387358 100644
--- a/crates/lsh/src/compiler/mod.rs
+++ b/crates/lsh/src/compiler/mod.rs
@@ -21,6 +21,7 @@ use std::path::Path;
 
 use stdext::arena::Arena;
 use stdext::collections::BString;
+use stdext::opt_ptr_eq;
 
 pub use self::charset::{Charset, SerializedCharset};
 use self::frontend::*;
@@ -163,7 +164,8 @@ impl<'a> Compiler<'a> {
                 break;
             };
 
-            if let IRI::If { condition, then } = node.borrow().instr {
+            let node = node.borrow();
+            if let IRI::If { condition, then } = node.instr {
                 // For the purpose of computing fast-skips the contents of if conditions are irrelevant,
                 // so skip the subtree. This is actually quite important. This this as an example:
                 //   loop {
@@ -176,7 +178,19 @@ impl<'a> Compiler<'a> {
                 //   }
                 // The inverted charset of the inner /b/ includes "a". If we merge that into the outer
                 // loop's charset we get one that covers all characters, making fast-skips impossible.
-                iter.skip_node(then);
+                // --> Skip the "then" subtree.
+                //
+                // HOWEVER, imagine a condition like this:
+                //   if /a?b/ {}
+                // This compiles to something like:
+                //   if "a"
+                //     .then -> if "b" {}
+                //     .else -> if "b" {}      (aka: .next)
+                // In other words, "then" and "next" point to the same thing.
+                // --> Only skip "then" if it's not the same as "next".
+                if !opt_ptr_eq(Some(then), node.next) {
+                    iter.skip_node(then);
+                }
 
                 match condition {
                     Condition::Cmp { .. } => {}

From 81a2b4d63351bd46e5661e61ca1638f49a5f6771 Mon Sep 17 00:00:00 2001
From: Leonard Hecker <leonard@hecker.io>
Date: Fri, 27 Mar 2026 16:22:56 +0100
Subject: [PATCH 07/11] Optimize any-char (.) expressions

---
 crates/lsh/src/compiler/regex.rs | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/crates/lsh/src/compiler/regex.rs b/crates/lsh/src/compiler/regex.rs
index e4d3c4696253..f32c475c4600 100644
--- a/crates/lsh/src/compiler/regex.rs
+++ b/crates/lsh/src/compiler/regex.rs
@@ -547,12 +547,10 @@ impl<'a, 'c> CodeGen<'a, 'c> {
             }
 
             Regex::Dot => {
-                let cs = Charset::yes();
-                let cs = self.compiler.intern_charset(&cs);
-                let condition = Condition::Charset { cs, min: 1, max: 1 };
-                let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_match });
-                if_node.borrow_mut().next = Some(on_fail);
-                Ok(if_node)
+                let dst = self.compiler.get_reg(Register::InputOffset);
+                let node = self.compiler.alloc_iri(IRI::AddImm { dst, imm: 1 });
+                node.borrow_mut().next = Some(on_match);
+                Ok(node)
             }
 
             Regex::EndOfLine => {

From ca8e7b260f29a57645f4649de827f2fd32ed33c8 Mon Sep 17 00:00:00 2001
From: Leonard Hecker <leonard@hecker.io>
Date: Fri, 27 Mar 2026 18:25:16 +0100
Subject: [PATCH 08/11] Optimize optional a? expressions

---
 crates/lsh/src/compiler/regex.rs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/crates/lsh/src/compiler/regex.rs b/crates/lsh/src/compiler/regex.rs
index f32c475c4600..e0041ea5928f 100644
--- a/crates/lsh/src/compiler/regex.rs
+++ b/crates/lsh/src/compiler/regex.rs
@@ -660,10 +660,18 @@ impl<'a, 'c> CodeGen<'a, 'c> {
             return self.emit_charset(cs, min, max, on_match, on_fail);
         }
 
-        // Single-char literal like `#+`: convert to charset for efficient handling.
+        // Single-char literal like `#+`
         if let Regex::Literal(ref s, case_insensitive) = *inner
             && s.len() == 1
         {
+            // Optional single char: `a?`
+            // It can be trivially translated to a Prefix/PrefixInsensitive check
+            // where even on_fail is a success and is thus connected to on_match.
+            if min == 0 && max == 1 {
+                return self.emit(inner, on_match, on_match);
+            }
+
+            // Otherwise, we must translate to a Charset match.
             let b = s.as_bytes()[0];
             let mut cs = Charset::no();
             if case_insensitive {

From 8fe1871fba670a68efd0aac06c7a162daf1dee2b Mon Sep 17 00:00:00 2001
From: Leonard Hecker <leonard@hecker.io>
Date: Fri, 27 Mar 2026 18:26:21 +0100
Subject: [PATCH 09/11] Optimize dual charsets [-+]

---
 crates/lsh/src/compiler/charset.rs | 11 ++++
 crates/lsh/src/compiler/regex.rs   | 81 +++++++++++++++++++++---------
 2 files changed, 67 insertions(+), 25 deletions(-)

diff --git a/crates/lsh/src/compiler/charset.rs b/crates/lsh/src/compiler/charset.rs
index ca772904b8b9..e7e301e3253d 100644
--- a/crates/lsh/src/compiler/charset.rs
+++ b/crates/lsh/src/compiler/charset.rs
@@ -35,6 +35,17 @@ impl Charset {
         (self.bits[hi] & (1 << lo)) != 0
     }
 
+    /// Removes the lowest member from the set and returns it, or `None` if the set is empty.
+    pub fn get_and_reset_lowest(&mut self) -> Option<u8> {
+        // Words are visited in ascending order, so the first non-zero one holds the lowest bit.
+        let (word_idx, word) = self.bits.iter_mut().enumerate().find(|(_, w)| **w != 0)?;
+        let bit_idx = word.trailing_zeros() as usize;
+
+        // `w & (w - 1)` clears exactly the lowest set bit; `w != 0` rules out underflow.
+        *word &= *word - 1;
+        Some((word_idx * WORD_BITS + bit_idx) as u8)
+    }
+
     pub fn covers_none(&self) -> bool {
         self.bits.iter().all(|&b| b == usize::MIN)
     }
diff --git a/crates/lsh/src/compiler/regex.rs b/crates/lsh/src/compiler/regex.rs
index e0041ea5928f..e2c6cd11f77c 100644
--- a/crates/lsh/src/compiler/regex.rs
+++ b/crates/lsh/src/compiler/regex.rs
@@ -43,6 +43,8 @@
 //! - The `parse()` function wires its generated IR into the provided destination nodes.
 //!   Don't pass nodes that are already part of the IR graph.
 
+use std::slice;
+
 use stdext::collections::BVec;
 
 use super::*;
@@ -524,27 +526,10 @@ impl<'a, 'c> CodeGen<'a, 'c> {
             Regex::Empty => Ok(on_match),
 
             Regex::Literal(s, case_insensitive) => {
-                if s.is_empty() {
-                    return Ok(on_match);
-                }
-                let s = self.compiler.intern_string(s);
-                let condition = if *case_insensitive {
-                    Condition::PrefixInsensitive(s)
-                } else {
-                    Condition::Prefix(s)
-                };
-                let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_match });
-                if_node.borrow_mut().next = Some(on_fail);
-                Ok(if_node)
+                self.emit_literal(s, *case_insensitive, on_match, on_fail)
             }
 
-            Regex::CharClass(cs) => {
-                let cs = self.compiler.intern_charset(cs);
-                let condition = Condition::Charset { cs, min: 1, max: u32::MAX };
-                let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_match });
-                if_node.borrow_mut().next = Some(on_fail);
-                Ok(if_node)
-            }
+            Regex::CharClass(cs) => self.emit_charset(cs, 1, 1, on_match, on_fail),
 
             Regex::Dot => {
                 let dst = self.compiler.get_reg(Register::InputOffset);
@@ -564,11 +549,7 @@ impl<'a, 'c> CodeGen<'a, 'c> {
             Regex::WordEnd => {
                 // \> is a zero-width assertion: succeeds if NOT followed by a word char.
                 // We invert the logic: check for word char, swap success/failure branches.
-                let cs = self.compiler.intern_charset(&ASCII_WORD_CHARSET);
-                let condition = Condition::Charset { cs, min: 1, max: 1 };
-                let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_fail });
-                if_node.borrow_mut().next = Some(on_match);
-                Ok(if_node)
+                self.emit_charset(&ASCII_WORD_CHARSET, 1, 1, on_fail, on_match)
             }
 
             Regex::Concat(parts) => {
@@ -706,6 +687,24 @@ impl<'a, 'c> CodeGen<'a, 'c> {
         Ok(current)
     }
 
+    /// Emits a prefix check for `s` (`on_match` on success, else `on_fail`); empty `s` emits nothing.
+    fn emit_literal(
+        &mut self,
+        s: &str,
+        case_insensitive: bool,
+        on_match: IRCell<'a>,
+        on_fail: IRCell<'a>,
+    ) -> Result<IRCell<'a>, String> {
+        if s.is_empty() {
+            return Ok(on_match);
+        }
+        let make = if case_insensitive { Condition::PrefixInsensitive } else { Condition::Prefix };
+        let condition = make(self.compiler.intern_string(s));
+        let node = self.compiler.alloc_iri(IRI::If { condition, then: on_match });
+        node.borrow_mut().next = Some(on_fail);
+        Ok(node)
+    }
+
     fn emit_charset(
         &mut self,
         cs: &Charset,
@@ -714,12 +713,44 @@ impl<'a, 'c> CodeGen<'a, 'c> {
         on_match: IRCell<'a>,
         on_fail: IRCell<'a>,
     ) -> Result<IRCell<'a>, String> {
+        let mut next = if min == 0 { on_match } else { on_fail };
+
+        // If the expression is of the form [a], [ab], [aA], or [aAbB] it is
+        // worth translating it to a Prefix/PrefixInsensitive check.
+        // The [a] and [aA] cases are an obvious improvement, but even the other
+        // two cases are worth it due to the shorter instruction encoding.
+        if max == 1 {
+            let mut cs = cs.clone();
+            let mut chars = [(0u8, false); 2];
+            let mut count = 0;
+
+            for slot in &mut chars {
+                let Some(mut idx) = cs.get_and_reset_lowest() else { break };
+                let case_insensitive = idx.is_ascii_uppercase() && cs.get(idx.to_ascii_lowercase());
+                if case_insensitive {
+                    idx = idx.to_ascii_lowercase();
+                    cs.set(idx, false);
+                }
+                *slot = (idx, case_insensitive);
+                count += 1;
+            }
+            // Only ASCII bytes are valid single-byte UTF-8; others must stay a Charset match.
+            if count > 0 && cs.covers_none() && chars[..count].iter().all(|c| c.0.is_ascii()) {
+                for &(ch, insensitive) in chars[..count].iter().rev() {
+                    // SAFETY: `ch` was checked to be ASCII above, so it is valid UTF-8.
+                    let s = unsafe { str::from_utf8_unchecked(slice::from_ref(&ch)) };
+                    next = self.emit_literal(s, insensitive, on_match, next)?;
+                }
+                return Ok(next);
+            }
+        }
+
         let cs = self.compiler.intern_charset(cs);
         let condition = Condition::Charset { cs, min, max };
         let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_match });
 
         // min=0 implies that it cannot fail. Remove `on_fail` to allow for later optimizations.
-        if_node.borrow_mut().next = Some(if min == 0 { on_match } else { on_fail });
+        if_node.borrow_mut().next = Some(next);
 
         Ok(if_node)
     }

From 07130a7e5e0fed489ccd9d1ce1b15a25602c1311 Mon Sep 17 00:00:00 2001
From: Leonard Hecker <leonard@hecker.io>
Date: Fri, 27 Mar 2026 18:32:49 +0100
Subject: [PATCH 10/11] Inline call/ret instruction jumps

---
 crates/lsh/src/compiler/backend.rs | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/crates/lsh/src/compiler/backend.rs b/crates/lsh/src/compiler/backend.rs
index cbe6f84c51cb..8a5016147c98 100644
--- a/crates/lsh/src/compiler/backend.rs
+++ b/crates/lsh/src/compiler/backend.rs
@@ -352,11 +352,29 @@ impl<'a> Backend<'a> {
 
                 // If the next instruction was already serialized (e.g. this is some form of loop),
                 // simply jump to the already serialized code. We're done here. Nothing new will come after this.
+                //
+                // If the destination is a call/ret instruction, we can just inline it.
+                // Otherwise, it'd be like jumping to a jump.
+                //
+                // TODO: If you think about it, this should kinda go into optimizer.rs, because it could
+                // do optimizations across entire instruction sequences (= it could do inlining!).
+                // But optimizer.rs doesn't have a linearized view of the assembly so it can't do this.
                 if ir.offset != usize::MAX {
-                    self.push_instruction(MovImm {
-                        dst: Register::ProgramCounter,
-                        imm: ir.offset as u32,
-                    });
+                    match ir.instr {
+                        IRI::Call { name } => {
+                            let tgt = self.dst_by_name(name) as u32;
+                            self.push_instruction(Call { tgt });
+                        }
+                        IRI::Return => {
+                            self.push_instruction(Return);
+                        }
+                        _ => {
+                            self.push_instruction(MovImm {
+                                dst: Register::ProgramCounter,
+                                imm: ir.offset as u32,
+                            });
+                        }
+                    }
                     break;
                 }
             }

From 8efe99472bcc93e25e3aacc093861b1652dfc3cc Mon Sep 17 00:00:00 2001
From: Leonard Hecker <leonard@hecker.io>
Date: Fri, 27 Mar 2026 18:38:45 +0100
Subject: [PATCH 11/11] Silence irrefutable_let_patterns warning in draw_menu_file

---
 crates/edit/src/bin/edit/draw_menubar.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/edit/src/bin/edit/draw_menubar.rs b/crates/edit/src/bin/edit/draw_menubar.rs
index 5d1acc2114a7..1ab702b64644 100644
--- a/crates/edit/src/bin/edit/draw_menubar.rs
+++ b/crates/edit/src/bin/edit/draw_menubar.rs
@@ -53,6 +53,7 @@ fn draw_menu_file(ctx: &mut Context, state: &mut State) {
             state.wants_file_picker = StateFilePicker::SaveAs;
         }
     }
+    #[allow(irrefutable_let_patterns)]
     if let path = Settings::borrow().path.as_path()
         && !path.as_os_str().is_empty()
         && ctx.menubar_menu_button(loc(LocId::FilePreferences), 'P', vk::NULL)