From 2fc81f5a5ea89afdf9ada726a440f6df8b54fd15 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Wed, 25 Mar 2026 20:53:27 +0100 Subject: [PATCH 01/11] Complete the initial LSH implementation --- .vscode/launch.json | 6 +- assets/highlighting-tests/bash.sh | 71 ++++++++ assets/highlighting-tests/batch.bat | 41 +++++ assets/highlighting-tests/html.html | 51 ++++++ assets/highlighting-tests/markdown.md | 75 ++++++++ assets/highlighting-tests/powershell.ps1 | 78 +++++++++ assets/highlighting-tests/properties.conf | 13 ++ assets/highlighting-tests/xml.xml | 39 +++++ assets/highlighting-tests/yaml.yml | 43 +++++ crates/edit/benches/lib.rs | 32 +++- crates/edit/src/buffer/mod.rs | 23 ++- crates/lsh/definitions/json.lsh | 39 +++++ crates/lsh/definitions/lsh.lsh | 50 ++++++ crates/lsh/definitions/markdown.lsh | 204 ++++++++++++++++++++++ crates/lsh/definitions/powershell.lsh | 64 +++++++ crates/lsh/definitions/properties.lsh | 55 ++++++ crates/lsh/definitions/utility.lsh | 21 +++ crates/lsh/definitions/xml.lsh | 78 +++++++++ crates/lsh/definitions/yaml.lsh | 78 +++++++++ 19 files changed, 1056 insertions(+), 5 deletions(-) create mode 100644 assets/highlighting-tests/bash.sh create mode 100644 assets/highlighting-tests/batch.bat create mode 100644 assets/highlighting-tests/html.html create mode 100644 assets/highlighting-tests/markdown.md create mode 100644 assets/highlighting-tests/powershell.ps1 create mode 100644 assets/highlighting-tests/properties.conf create mode 100644 assets/highlighting-tests/xml.xml create mode 100644 assets/highlighting-tests/yaml.yml create mode 100644 crates/lsh/definitions/json.lsh create mode 100644 crates/lsh/definitions/lsh.lsh create mode 100644 crates/lsh/definitions/markdown.lsh create mode 100644 crates/lsh/definitions/powershell.lsh create mode 100644 crates/lsh/definitions/properties.lsh create mode 100644 crates/lsh/definitions/utility.lsh create mode 100644 crates/lsh/definitions/xml.lsh create mode 100644 crates/lsh/definitions/yaml.lsh diff --git a/.vscode/launch.json b/.vscode/launch.json index 7142960c4f9d..a62e8d6c3a77 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -10,7 +10,7 @@ "program": "${workspaceFolder}/target/debug/edit", "cwd": "${workspaceFolder}", "args": [ - "${workspaceFolder}/crates/edit/src/bin/edit/main.rs" + "${workspaceFolder}/assets/highlighting-tests/markdown.md" ], }, { @@ -23,7 +23,7 @@ "program": "${workspaceFolder}/target/debug/edit", "cwd": "${workspaceFolder}", "args": [ - "${workspaceFolder}/crates/edit/src/bin/edit/main.rs" + "${workspaceFolder}/assets/highlighting-tests/markdown.md" ], }, { @@ -40,7 +40,7 @@ "program": "${workspaceFolder}/target/debug/edit", "cwd": "${workspaceFolder}", "args": [ - "${workspaceFolder}/crates/edit/src/bin/edit/main.rs" + "${workspaceFolder}/assets/highlighting-tests/markdown.md" ], }, { diff --git a/assets/highlighting-tests/bash.sh b/assets/highlighting-tests/bash.sh new file mode 100644 index 000000000000..dfd5238724db --- /dev/null +++ b/assets/highlighting-tests/bash.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash + +# This is a comment + +readonly VAR1="Hello" # String literal +VAR2=42 # Integer literal +VAR3=$((VAR2 + 8)) # Arithmetic expansion +VAR4=$(echo "World") # Command substitution + +function greet() { # Function definition + local name="$1" # Local variable, parameter expansion + echo "${VAR1}, $name! $VAR4" # String, parameter expansion, variable +} + +greet "User" # Function call, string literal + +if [[ $VAR2 -gt 40 && $VAR3 -eq 50 ]]; then # Conditional, test, operators + echo "Numbers are correct" # String literal +elif (( VAR2 < 40 )); then # Arithmetic test + echo 'VAR2 is less than 40' # Single-quoted string +else + echo "Other case" +fi + +for i in {1..3}; do # Brace expansion, for loop + echo "Loop $i" # String, variable +done + +case "$VAR4" in # Case statement + World) echo "It's World";; # Pattern, string + *) echo "Unknown";; # Wildcard +esac + +arr=(one two three) # Array +echo "${arr[1]}" # Array access + +declare -A assoc # Associative array +assoc[key]="value" +echo "${assoc[key]}" + +# Here document +cat < /dev/null + +# Background job +sleep 1 & + +# Arithmetic assignment +let VAR2+=1 + +# Process substitution +diff <(echo foo) <(echo bar) + +# Command grouping +{ echo "Group 1"; echo "Group 2"; } + +# Escaped characters +echo "A quote: \" and a backslash: \\" + +# End of file diff --git a/assets/highlighting-tests/batch.bat b/assets/highlighting-tests/batch.bat new file mode 100644 index 000000000000..962ef66007af --- /dev/null +++ b/assets/highlighting-tests/batch.bat @@ -0,0 +1,41 @@ +@echo off +REM --- String, Variable, Label, Command, Operator, Number, Delimiter, Comment --- + +:: Label +:Start + +:: Variable assignment and usage +set "VAR1=Hello" +set VAR2=World + +:: String with spaces and special characters +set "STR=Batch ^& CMD!" + +:: Arithmetic operation (number, operator) +set /a SUM=5+10 + +:: IF statement (keyword, operator, string, variable) +if "%VAR1%"=="Hello" ( + echo %VAR1%, %VAR2%! %STR% +) else ( + echo Not matched! +) + +:: FOR loop (keyword, variable, delimiter, string) +for %%F in (*.bat) do ( + echo Found file: %%F +) + +:: CALL command (keyword, label) +call :SubRoutine + +:: GOTO command (keyword, label) +goto :End + +:: Subroutine with parameter +:SubRoutine +echo In subroutine with SUM=%SUM% +goto :eof + +:End +REM End of script diff --git a/assets/highlighting-tests/html.html b/assets/highlighting-tests/html.html new file mode 100644 index 000000000000..35682eee1b34 --- /dev/null +++ b/assets/highlighting-tests/html.html @@ -0,0 +1,51 @@ + + + + + + + HTML Syntax Test & Demo + + + + + +

Heading

+ + +

Text with < > & " ' A A entities

+
+
+ Description + + + + + + +
+ Inline bold text +
    +
  • Item 1
  • +
  • Item 2
  • +
+
+ + + + + + diff --git a/assets/highlighting-tests/markdown.md b/assets/highlighting-tests/markdown.md new file mode 100644 index 000000000000..c4845eb68014 --- /dev/null +++ b/assets/highlighting-tests/markdown.md @@ -0,0 +1,75 @@ +# H1 + +## H2 + +### H3 + +#### H4 + +##### H5 + +###### H6 + +regular +*italic* +_italic_ +**bold** +__bold__ +***bold italic*** +**_bold italic_** +__*bold italic*__ +~~strikethrough~~ +`inline code` +`` `literal` `` +\*not\* \_italic\_ # not a heading + +* Unordered item + - Nested item + * Third level +* Task list: + * [ ] To do + * [x] Done + * [ ] *Mixed* **formatting** with `code` +1. Ordered can start anywhere +2. …like here (intentional) + 1. Nested ordered + 2. Multiple paragraphs within a list item: + Still the same item. + +> A single-level quote +> +> > A nested quote with **bold** and `code` +> +> * List in a quote +> * [Link in quote](#links) + +Inline: [Example](https://example.com "Example Title") +Reference: [Ref Link][ref] and [Another][another-ref] +Relative: [This section](#tables) +Footnote: [^note] +[ref]: https://example.com +[another-ref]: https://github.com +[^note]: This is a footnote with **formatting** and a [link](https://github.com). + +Inline: ![Alt text](https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png "GitHub Mark") +Reference: ![Logo][logo-ref] +[logo-ref]: https://github.githubassets.com/images/modules/logos_page/GitHub-Logo.png "GitHub Logo" + +| Left | Center | Right | +| :---------- | :--------: | ----: | +| *italic* | `code` | 123 | +| **bold** | ~~strike~~ | 4.56 | +| [link][ref] | :tada: | `end` | + +```bash +# Shell +echo "Hello, world" | tr a-z A-Z +``` + +```json +{ + "name": "gfm-kitchen-sink", + "private": true, + "scripts": { "test": "echo ok" } +} +``` diff --git a/assets/highlighting-tests/powershell.ps1 b/assets/highlighting-tests/powershell.ps1 new file mode 100644 index 000000000000..e2bb01f39c6c --- /dev/null +++ b/assets/highlighting-tests/powershell.ps1 @@ -0,0 +1,78 @@ +# Single-line comment + +<# +Multi-line +comment +#> + +function Get-SampleData { + param( + [string]$Name = "World", # String literal, parameter + [int]$Count = 3 + ) + + $array = @(1, 2, 3) # Array literal + $hashtable = @{ Key1 = 'Value1'; Key2 = 42 } # Hashtable literal + + $nullVar = $null + $boolTrue = $true + $boolFalse = $false + + $regexMatch = "abc123" -match '\d+' # Regex literal + + for ($i = 0; $i -lt $Count; $i++) { + Write-Host "Hello, $Name! Iteration: $i" # Variable interpolation, string + } + + if ($hashtable.Key2 -eq 42) { + Write-Output "Hashtable value is 42" + } + elseif ($hashtable.Key2 -gt 40) { + Write-Output "Hashtable value is greater than 40" + } + else { + Write-Output "Hashtable value is less than or equal to 40" + } + + switch ($Name) { + "World" { Write-Host "Default name used." } + default { Write-Host "Custom name: $Name" } + } + + try { + throw "An error occurred" + } + catch { + Write-Warning $_ + } + finally { + Write-Verbose "Finally block executed" + } + + $script:globalVar = 99 # Scope modifier + + # Here-String + $hereString = @" +This is a here-string. +Name: $Name +"@ + + return $hereString +} + +# Command invocation, pipeline, splatting +$paramSplat = @{ + Name = 'PowerShell' + Count = 2 +} +Get-SampleData @paramSplat | Out-File -FilePath "./output.txt" + +# Type literal, member access, method call +[System.DateTime]::Now.ToString("yyyy-MM-dd") + +# Subexpression +Write-Host "2 + 2 = $($array[0] + $array[1])" + +# Command substitution +$pwdPath = $(Get-Location).Path +Write-Host "Current directory: $pwdPath" diff --git a/assets/highlighting-tests/properties.conf b/assets/highlighting-tests/properties.conf new file mode 100644 index 000000000000..3bd3b62693ba --- /dev/null +++ b/assets/highlighting-tests/properties.conf @@ -0,0 +1,13 @@ +# General Settings +[General] +enabled = true +debug = false +log_level = info +max_connections = 1000 + +[SSL] +enabled = true +cert_file = /etc/ssl/certs/server.crt +key_file = /etc/ssl/private/server.key +protocols = TLSv1.2, TLSv1.3 # Supported protocols: "TLSv1.2" and "TLSv1.3" +cipher_suite = "ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256" diff --git a/assets/highlighting-tests/xml.xml b/assets/highlighting-tests/xml.xml new file mode 100644 index 000000000000..ad2c2aa09a0d --- /dev/null +++ b/assets/highlighting-tests/xml.xml @@ -0,0 +1,39 @@ + + +]> + + + + + + + Text & < > " ' content + &custom; + + + + + + + Text before inline text after + + + A A 😀 + + + + + + !@#$%^*()_+-={}[]|\:;"'<>,. + + + + + + + diff --git a/assets/highlighting-tests/yaml.yml b/assets/highlighting-tests/yaml.yml new file mode 100644 index 000000000000..6dbf0abada91 --- /dev/null +++ b/assets/highlighting-tests/yaml.yml @@ -0,0 +1,43 @@ +# This is a comment +--- +string: "Hello, world!" +plain: plainValue +multiline: | + This is a + true + multiline string. +folded: > + This is a + true + folded string. +number_int: 42 +number_float: 3.1415 +number_scientific: 1.23e45 +number_negative: -7 +boolean_true: true +boolean_false: false +null_value: null +explicit_null: ~ +date: 2024-06-01 +timestamp: 2024-06-01T12:34:56Z +confusable_string_number: 1.23e45 1.23e45 # This is a comment +sequence: + - item1 + - item2 + - 3 + - true +mapping: + key1: value1 + key2: value2 +nested: + - name: Alice + age: 30 + married: false + - name: Bob + age: 25 + married: true +empty_sequence: [foo, 123, bar] +empty_mapping: { foo: bar } +literal_colon: "value:with:colons" +literal_dash: "-not-a-sequence" +special_chars: "Tab:\t Newline:\n Unicode:\u2713" diff --git a/crates/edit/benches/lib.rs b/crates/edit/benches/lib.rs index 45c4d52793bf..372e8ebaebac 100644 --- a/crates/edit/benches/lib.rs +++ b/crates/edit/benches/lib.rs @@ -3,11 +3,12 @@ use std::hint::black_box; use std::io::Cursor; +use std::path::Path; use std::{mem, vec}; use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; use edit::helpers::*; -use edit::{buffer, hash, json, oklab, simd, unicode}; +use edit::{buffer, hash, json, lsh, oklab, simd, unicode}; use stdext::arena::{self, scratch_arena}; use stdext::collections::BVec; use stdext::float::parse_f64_approx; @@ -187,6 +188,34 @@ fn bench_json(c: &mut Criterion) { ); } +fn bench_lsh(c: &mut Criterion) { + let bytes = include_bytes!("../../../assets/highlighting-tests/markdown.md"); + let bytes = &bytes[..]; + let lang = lsh::LANGUAGES.iter().find(|lang| lang.id == "markdown").unwrap(); + let highlighter = lsh::Highlighter::new(black_box(&bytes), lang); + + c.benchmark_group("lsh").throughput(Throughput::Bytes(bytes.len() as u64)).bench_function( + "markdown", + |b| { + b.iter(|| { + let mut h = highlighter.clone(); + loop { + let scratch = scratch_arena(None); + let res = h.parse_next_line(&scratch); + if res.is_empty() { + break; + } + } + }) + }, + ); + + c.benchmark_group("lsh").bench_function("process_file_associations", |b| { + let path = Path::new("/some/long/path/to/file/foo.bar.foo.bar.foo.bar"); + b.iter(|| lsh::process_file_associations(lsh::FILE_ASSOCIATIONS, black_box(path))) + }); +} + fn bench_oklab(c: &mut Criterion) { c.benchmark_group("oklab") .bench_function("StraightRgba::as_oklab", |b| { @@ -294,6 +323,7 @@ fn bench(c: &mut Criterion) { bench_glob(c); bench_hash(c); bench_json(c); + bench_lsh(c); bench_oklab(c); bench_simd_lines_fwd(c); bench_simd_memchr2(c); diff --git a/crates/edit/src/buffer/mod.rs b/crates/edit/src/buffer/mod.rs index 1e2b46110013..4ec01f103a7f 100644 --- a/crates/edit/src/buffer/mod.rs +++ b/crates/edit/src/buffer/mod.rs @@ -42,7 +42,7 @@ use stdext::{ReplaceRange as _, arena_write_fmt, minmax, slice_as_uninit_mut, sl use crate::cell::SemiRefCell; use crate::clipboard::Clipboard; use crate::document::{ReadableDocument, WriteableDocument}; -use crate::framebuffer::{Framebuffer, IndexedColor}; +use crate::framebuffer::{Attributes, Framebuffer, IndexedColor}; use crate::helpers::*; use crate::lsh::cache::HighlighterCache; use crate::lsh::{HighlightKind, Highlighter, Language}; @@ -2133,13 +2133,31 @@ impl TextBuffer { let color = match curr.kind { HighlightKind::Other => None, HighlightKind::Comment => Some(IndexedColor::Green), + HighlightKind::Method => Some(IndexedColor::BrightYellow), + HighlightKind::String => Some(IndexedColor::BrightRed), + HighlightKind::Variable => Some(IndexedColor::BrightCyan), + HighlightKind::ConstantLanguage => Some(IndexedColor::BrightBlue), HighlightKind::ConstantNumeric => Some(IndexedColor::BrightGreen), HighlightKind::KeywordControl => Some(IndexedColor::BrightMagenta), + HighlightKind::KeywordOther => Some(IndexedColor::BrightBlue), + HighlightKind::MarkupBold => None, HighlightKind::MarkupChanged => Some(IndexedColor::BrightBlue), HighlightKind::MarkupDeleted => Some(IndexedColor::BrightRed), + HighlightKind::MarkupHeading => Some(IndexedColor::BrightBlue), HighlightKind::MarkupInserted => Some(IndexedColor::BrightGreen), + HighlightKind::MarkupItalic => None, + HighlightKind::MarkupLink => None, + HighlightKind::MarkupList => Some(IndexedColor::BrightBlue), + HighlightKind::MarkupStrikethrough => None, HighlightKind::MetaHeader => Some(IndexedColor::BrightBlue), }; + let attr = match curr.kind { + HighlightKind::MarkupBold => Some(Attributes::Bold), + HighlightKind::MarkupItalic => Some(Attributes::Italic), + HighlightKind::MarkupLink => Some(Attributes::Underlined), + HighlightKind::MarkupStrikethrough => Some(Attributes::Strikethrough), + _ => None, + }; // Handle the case where the highlight spans multiple visual lines // due to word wrapping. The range is [beg, end) in terms of offsets, @@ -2201,6 +2219,9 @@ impl TextBuffer { if let Some(color) = color { fb.blend_fg(rect, fb.indexed(color)); } + if let Some(attr) = attr { + fb.replace_attr(rect, Attributes::All, attr); + } } } } diff --git a/crates/lsh/definitions/json.lsh b/crates/lsh/definitions/json.lsh new file mode 100644 index 000000000000..6b8f296b441b --- /dev/null +++ b/crates/lsh/definitions/json.lsh @@ -0,0 +1,39 @@ +#[display_name = "JSON"] +#[path = "**/*.json"] +#[path = "**/*.jsonc"] +pub fn json() { + until /$/ { + yield other; + + if /\/\/.*/ { + yield comment; + } else if /\/\*/ { + loop { + yield comment; + await input; + if /\*\// { + yield comment; + break; + } + } + } else if /"/ { + double_quote_string(); + } else if /true|false|null/ { + if /\w+/ { + // Not a keyword after all. + } else { + yield constant.language; + } + } else if /-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?/ { + if /\w+/ { + // Not a number after all. + } else { + yield constant.numeric; + } + } else if /\w+/ { + // Invalid token? Skip. + } + + yield other; + } +} diff --git a/crates/lsh/definitions/lsh.lsh b/crates/lsh/definitions/lsh.lsh new file mode 100644 index 000000000000..e9cf958916c8 --- /dev/null +++ b/crates/lsh/definitions/lsh.lsh @@ -0,0 +1,50 @@ +#[display_name = "LSH"] +#[path = "**/*.lsh"] +pub fn lsh() { + until /$/ { + yield other; + + if /\/\/.*/ { + yield comment; + } else if /pub|fn|await/ { + if /\w+/ { + yield other; + } else { + yield keyword.other; + } + } else if /if|else|until|loop|break|continue/ { + if /\w+/ { + yield other; + } else { + yield keyword.control; + } + } else if /yield/ { + if /\w+/ { + yield other; + } else { + yield keyword.other; + if /\s+/ { + // Gobble space to the argument + yield other; + } + if /[\w.]+/ { + // The yield argument + yield markup.link; + } + } + } else if /\// { + until /$/ { + if /\\./ { + // Skip escape char + } else if /\// { + yield string; + break; + } + } + } else if /"/ { + double_quote_string(); + } + + yield other; + } +} diff --git a/crates/lsh/definitions/markdown.lsh b/crates/lsh/definitions/markdown.lsh new file mode 100644 index 000000000000..f5444d32f94d --- /dev/null +++ b/crates/lsh/definitions/markdown.lsh @@ -0,0 +1,204 @@ +#[display_name = "Markdown"] +#[path = "**/*.md"] +pub fn markdown() { + // Gobble any leading whitespace on the line. + if /\s+/ { + yield other; + } + + if /#+\s+.*/ { + yield markup.heading; + } else if />.*/ { + yield comment; + } else if /```/ { + // NOTE: These checks are sorted alphabetically. + if /(?i:diff)/ { + loop { + await input; + if /\s*```/ { + return; + } else { + diff(); + // diff() may not have eaten that line. + // It doesn't have to. That's our responsibility. + if /.*/ {} + } + } + } else if /(?i:json)/ { + loop { + await input; + if /\s*```/ { + return; + } else { + json(); + if /.*/ {} + } + } + } else if /(?i:yaml)/ { + loop { + await input; + if /\s*```/ { + return; + } else { + yaml(); + if /.*/ {} + } + } + } else if /(?i:pwsh|powershell)/ { + loop { + await input; + if /\s*```/ { + return; + } else { + powershell(); + if /.*/ {} + } + } + } else { + loop { + await input; + if /\s*```.*/ { + return; + } else if /.*/ { + // Gobble the rest of the line with no highlighting. + } + } + } + } else { + if /[\*-]\s+/ { + yield markup.list; + } + + // The structure of inline emphasis, etc., matchers is + // awfully wrong, but parsing Markdown is also awful in general. + // In fact I think it's categorically impossible to implement like this, + // because it requires lookahead across potentially infinite lines: + // The Markdown spec describes its "parsing strategy" as "two phase" + // where the first phase segments the document into block structures. + // For instance, a `comment` may span multiple lines, _unless_ + // there's a paragraph break inbetween, e.g. due to a ">" comment. + until /$/ { + yield other; + + if /\[/ { + // Anything inside [...] is a link text and is colored like a string. + yield other; + if /[^\]]*/ { + yield string; + + // Now look for the link target in parentheses. + if /\]\(/ { + yield other; + if /[^\s\)]+/ { + yield markup.link; + } + + if /\s+/ { + } + + // The link may have a title in quotes. + yield other; + if /"/ { + if /[^"]+/ { + } + if /"/ { + yield string; + } + } + } + + } + } else if /__/ { + if /_+/ { + // "_____" = no hit + } else if /\s+/ { + // "__ foo __" = no hit + } else { + until /$/ { + if /\\./ { + // gobble escape char + } else if /[^\s_]+__/ { + yield markup.bold; + break; + } else if /__/ { + break; + } + } + } + } else if /\*\*/ { + if /\s+/ { + // "** foo **" = no hit + } else { + until /$/ { + if /\\./ { + // gobble escape char + } else if /[^\s*]+\*\*/ { + yield markup.bold; + break; + } else if /\*\*/ { + break; + } + } + } + } else if /_/ { + if /\s+/ { + // "_ foo _" = no hit + } else { + until /$/ { + if /\\./ { + // gobble escape char + } else if /[^\s_]+_/ { + yield markup.italic; + break; + } else if /_/ { + break; + } + } + } + } else if /\*/ { + if /\s+/ { + // "* foo *" = no hit + } else { + until /$/ { + if /\\./ { + // gobble escape char + } else if /[^\s*]+\*/ { + yield markup.italic; + break; + } else if /\*/ { + break; + } + } + } + } else if /~~/ { + if /\s+/ { + // "~~ foo ~~" = no hit + } else { + until /$/ { + if /\\./ { + // gobble escape char + } else if /[^\s~]+~~/ { + yield markup.strikethrough; + break; + } else if /~~/ { + break; + } + } + } + } else if /```/ { + until /$/ { if /```/ { break; } } + yield string; + } else if /``/ { + until /$/ { if /``/ { break; } } + yield string; + } else if /`/ { + until /$/ { if /`/ { break; } } + yield string; + } else if /\\./ { + // Gobble escape char + } + + yield other; + } + } +} diff --git a/crates/lsh/definitions/powershell.lsh b/crates/lsh/definitions/powershell.lsh new file mode 100644 index 000000000000..ce4bf5e45747 --- /dev/null +++ b/crates/lsh/definitions/powershell.lsh @@ -0,0 +1,64 @@ +#[display_name = "PowerShell"] +#[path = "**/*.ps1"] +#[path = "**/*.psd1"] +#[path = "**/*.psm1"] +pub fn powershell() { + until /$/ { + yield other; + + if /#.*/ { + yield comment; + } else if /<#/ { + loop { + yield comment; + if /#>/ { yield comment; break; } + await input; + } + } else if /'/ { + loop { + yield string; + if /\\./ {} + else if /'/ { yield string; break; } + await input; + } + } else if /@"/ { + loop { + yield string; + if /\\./ {} + else if /"@/ { yield string; break; } + await input; + } + } else if /"/ { + loop { + yield string; + if /\\./ {} + else if /"/ { yield string; break; } + await input; + } + } else if /function|param/ { + yield keyword.other; + } else if /elseif|else|if|for|switch|default|throw|try|catch|finally/ { + yield keyword.control; + } else if /-?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?/ { + if /[\w\-]+/ { + yield method; + } else { + yield constant.numeric; + } + } else if /\$false|\$true|\$null/ { + if /\w+/ { + yield variable; + } else { + yield constant.language; + } + } else if /[$@][\w\d?:]+/ { + yield variable; + } else if /[\w\d?:]+-[\w\d?:-]*/ { + yield method; + } else if /[\w\d?:]+/ { + // Gobble any other tokens that should not be highlighted + } + + yield other; + } +} diff --git a/crates/lsh/definitions/properties.lsh b/crates/lsh/definitions/properties.lsh new file mode 100644 index 000000000000..fe109b406dd1 --- /dev/null +++ b/crates/lsh/definitions/properties.lsh @@ -0,0 +1,55 @@ +#[display_name = "Properties"] +// ini +#[path = "**/*.ini"] +// git +#[path = "**/.config/git/config"] +#[path = "**/.git/config"] +#[path = "**/gitconfig"] +#[path = "**/*.gitattributes"] +#[path = "**/*.gitconfig"] +#[path = "**/*.gitmodules"] +// things that VS Code calls "properties" files +#[path = "**/*.cfg"] +#[path = "**/*.conf"] +#[path = "**/*.directory"] +#[path = "**/*.editorconfig"] +#[path = "**/*.properties"] +#[path = "**/*.repo"] +// systemd +#[path = "**/*.automount"] +#[path = "**/*.dnssd"] +#[path = "**/*.link"] +#[path = "**/*.mount"] +#[path = "**/*.netdev"] +#[path = "**/*.network"] +#[path = "**/*.nspawn"] +#[path = "**/*.path"] +#[path = "**/*.service"] +#[path = "**/*.slice"] +#[path = "**/*.socket"] +#[path = "**/*.swap"] +#[path = "**/*.systemd"] +#[path = "**/*.target"] +#[path = "**/*.timer"] +pub fn properties() { + if /\[[^\[]+\]/ { + yield meta.header; + } + + until /$/ { + yield other; + + if /[;#].*/ { + yield comment; + } else if /([\w.-]+)\s*=\s*[\w.-]*/ { + yield $1 as variable; + yield other; + } else if /'/ { + single_quote_string(); + } else if /"/ { + double_quote_string(); + } + + yield other; + } +} diff --git a/crates/lsh/definitions/utility.lsh b/crates/lsh/definitions/utility.lsh new file mode 100644 index 000000000000..d8f6be7ed76d --- /dev/null +++ b/crates/lsh/definitions/utility.lsh @@ -0,0 +1,21 @@ +fn single_quote_string() { + until /$/ { + if /\\./ { + // Escape sequences + } else if /'/ { + yield string; + break; + } + } +} + +fn double_quote_string() { + until /$/ { + if /\\./ { + // Escape sequences + } else if /"/ { + yield string; + break; + } + } +} diff --git a/crates/lsh/definitions/xml.lsh b/crates/lsh/definitions/xml.lsh new file mode 100644 index 000000000000..988273c91654 --- /dev/null +++ b/crates/lsh/definitions/xml.lsh @@ -0,0 +1,78 @@ +#[display_name = "XML"] +#[path = "**/*.csproj.user"] +#[path = "**/*.csproj"] +#[path = "**/*.html"] +#[path = "**/*.nuspec"] +#[path = "**/*.proj"] +#[path = "**/*.props"] +#[path = "**/*.rss"] +#[path = "**/*.slnx"] +#[path = "**/*.svg"] +#[path = "**/*.targets"] +#[path = "**/*.vcxproj.filters"] +#[path = "**/*.vcxproj"] +#[path = "**/*.xaml"] +#[path = "**/*.xml"] +#[path = "**/*.xml"] +pub fn xml() { + until /$/ { + yield other; + + if // { + yield comment; + break; + } + } + } else if // { + yield string; + break; + } + } + } else if /]*>/ { + yield $1 as constant.language; + yield other; + } else if /(?:<\?|<)([\w:.-]+)/ { + yield $1 as constant.language; + yield other; + + until />/ { + yield other; + await input; + + if /([\w:.-]+)\s*=/ { + yield $1 as variable; + yield other; + } else if /"/ { + until /"/ { + yield string; + await input; + } + yield string; + } else if /'/ { + until /'/ { + yield string; + await input; + } + yield string; + } + } + + yield other; + } else if /<\/([\w:.-]+)\s*>/ { + yield $1 as constant.language; + yield other; + } else if /(?:&#|&)[\w:.-]+;/ { + yield constant.numeric; + } + + yield other; + } +} diff --git a/crates/lsh/definitions/yaml.lsh b/crates/lsh/definitions/yaml.lsh new file mode 100644 index 000000000000..a090447e3dd5 --- /dev/null +++ b/crates/lsh/definitions/yaml.lsh @@ -0,0 +1,78 @@ +#[display_name = "YAML"] +#[path = "**/*.yaml"] +#[path = "**/*.yml"] +pub fn yaml() { + if /\s+/ { + // Leading whitespace + } + + var indentation = off; + + loop { + if /-\s+/ { + // List item + } + if /\w+:(?:\s+|$)/ { + // Key: (Value follows) + } + + if /[|>][-+]?.*/ { + // If we see a multiline string inducer, capture content until indentation is back to before. + // Since we'll skip multiple lines, we need to use nested loops here. + + loop { + // Wait for the next line in this multiline string. + await input; + + // Skip leading whitespace + if /\s+/ { + } + + if off <= indentation { + // Once the new indentation is smaller, the multiline string has ended. + break; + } + + if /.*/ { + yield string; + } + } + } else { + break; + } + } + + yield other; + + if /---/ { + yield other; + } else if /true|false|yes|no|null|~/ { + // If it looks like a keyword, check if it's actually a string (e.g. "foo: true false"). + // TODO: Ideally this would not consume the last chunk of /\s+/ before a /# comment/. + if /\s*[^\s#]+[^#]*/ { + yield string; + } else { + yield constant.language; + } + } else if /-?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?/ { + // Same as with keywords above. + if /\s*[^\s#]+[^#]*/ { + yield string; + } else { + yield constant.numeric; + } + } else if /[^\s#]+[^#]*/ { + // Same as before: Gobble anything that looks like a string. + // TODO: This could be simplified & combined with the above if we stored the + // highlight kind into a variable and then yield the variable value itself. + yield string; + } + + if /\s+/ { + yield other; + } + + if /#.*/ { + yield comment; + } +} From 6d52672ddc44d65e8532f87ce875eeeb275f93e3 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Thu, 26 Mar 2026 15:25:37 +0100 Subject: [PATCH 02/11] Fix Charset::set_range --- crates/lsh/src/compiler/charset.rs | 32 +++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/crates/lsh/src/compiler/charset.rs b/crates/lsh/src/compiler/charset.rs index 32d1e99653ee..ca772904b8b9 100644 --- a/crates/lsh/src/compiler/charset.rs +++ b/crates/lsh/src/compiler/charset.rs @@ -62,6 +62,11 @@ impl Charset { let end_word = end / WORD_BITS; let end_bit = end % WORD_BITS; + // Save the bits above end_bit in the last word before we overwrite it + let shift = end_bit + 1; + let tail_mask = if shift < WORD_BITS { usize::MAX << shift } else { 0 }; + let saved_tail = self.bits[end_word] & tail_mask; + // Write the starting bits of the first word let mask = usize::MAX << start_bit; self.bits[start_word] = @@ -74,11 +79,8 @@ impl Charset { word += 1; } - // Fix the trailing bits of the last word we wrote above - let shift = end_bit + 1; - let mask = if shift < WORD_BITS { usize::MAX << shift } else { 0 }; - self.bits[end_word] = - if value { self.bits[end_word] & !mask } else { self.bits[end_word] | mask }; + // Restore the trailing bits of the last word + self.bits[end_word] = (self.bits[end_word] & !tail_mask) | saved_tail; } pub fn merge(&mut self, other: &Charset) { @@ -198,4 +200,24 @@ mod tests { } } } + + #[test] + fn set_range_overlapping() { + // Two single-bit ranges in the same word must not clobber each other. + let mut cs = Charset::no(); + cs.set_range(b'e'..=b'e', true); + cs.set_range(b'E'..=b'E', true); + for i in 0u8..=255 { + assert_eq!(cs.get(i), i == b'e' || i == b'E', "bit {i}"); + } + + // A wide range must not destroy bits set earlier outside it. + let mut cs = Charset::no(); + cs.set_range(0..=0, true); + cs.set_range(255..=255, true); + cs.set_range(10..=245, true); + for i in 0u8..=255 { + assert_eq!(cs.get(i), i == 0 || (10..=245).contains(&i) || i == 255, "bit {i}"); + } + } } From 7474c099bd1222b916c52237f76cc876e94426f1 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Thu, 26 Mar 2026 15:25:52 +0100 Subject: [PATCH 03/11] Clean up atom/literal parsing, Add \t support --- crates/lsh/src/compiler/regex.rs | 281 +++++++++++++++---------------- 1 file changed, 132 insertions(+), 149 deletions(-) diff --git a/crates/lsh/src/compiler/regex.rs b/crates/lsh/src/compiler/regex.rs index 1862ce929706..e4d3c4696253 100644 --- a/crates/lsh/src/compiler/regex.rs +++ b/crates/lsh/src/compiler/regex.rs @@ -109,6 +109,14 @@ enum Regex { Dot, } +enum Atom { + Empty, + Meta(char), + Char(char), + WordEnd, + Class(Charset), +} + struct RegexParser<'a> { input: &'a str, pos: usize, @@ -132,8 +140,12 @@ impl<'a> RegexParser<'a> { Ok(result) } + fn rest(&self) -> &'a str { + &self.input[self.pos..] + } + fn peek(&self) -> Option { - self.input[self.pos..].chars().next() + self.rest().chars().next() } fn advance(&mut self) -> Option { @@ -186,7 +198,7 @@ impl<'a> RegexParser<'a> { /// a?, a*, a+, a{n,m} fn parse_quantified(&mut self) -> Result { - let base = self.parse_atom()?; + let base = self.parse_primary()?; let (min, max) = match self.peek() { Some('?') => { @@ -237,7 +249,7 @@ impl<'a> RegexParser<'a> { } /// Parse a single atom (literal, class, group, anchor) - fn parse_atom(&mut self) -> Result { + fn parse_primary(&mut self) -> Result { match self.peek() { None => Ok(Regex::Empty), Some('(') => self.parse_group(), @@ -260,67 +272,38 @@ impl<'a> RegexParser<'a> { /// metacharacters into a single literal. For example, `\+\+\+` becomes `Literal("+++")`. fn parse_literal(&mut self) -> Result { let mut lit = String::new(); + let mut prev_atom_lit_len = 0; + let mut prev_atom_pos = self.pos; loop { - match self.peek() { - Some('\\') => { - let escape_char = self.input[self.pos + 1..].chars().next(); - match escape_char { - // Character classes - Some('w' | 'W' | 'd' | 'D' | 's' | 'S') => { - if lit.is_empty() { - // Start with the class - self.advance(); // consume '\' - return self.parse_escape_as_regex(); - } else { - // Return accumulated literal, leave escape for next parse - break; - } - } - // Word boundary - Some('>') => { - if lit.is_empty() { - self.advance(); // consume '\' - self.advance(); // consume '>' - return Ok(Regex::WordEnd); - } else { - break; - } - } - // Simple escape - Some(c) if !c.is_ascii_alphanumeric() => { - // Check if this char would be quantified - let after_escape = self.pos + 1 + c.len_utf8(); - if after_escape < self.input.len() && !lit.is_empty() { - let next = self.input[after_escape..].chars().next(); - if matches!(next, Some('?' | '*' | '+' | '{')) { - break; - } - } - self.advance(); // consume '\' - self.advance(); // consume escaped char - lit.push(c); - } - Some(c) => { - return Err(format!("unknown escape sequence '\\{}'", c)); - } - None => { - return Err("unexpected end of pattern after backslash".to_string()); - } + let start = self.pos; + match self.parse_atom()? { + Atom::Meta('?' | '*' | '+' | '{') => { + // Quantifiers apply to the preceding atom, so we need to pop + // the last atom (= char / escape char) and stop parsing. + if prev_atom_lit_len == 0 { + self.pos = start; + } else { + lit.truncate(prev_atom_lit_len); + self.pos = prev_atom_pos; } + break; } - Some(c) if !is_meta_char(c) => { - let next_pos = self.pos + c.len_utf8(); - if next_pos < self.input.len() && !lit.is_empty() { - let next = self.input[next_pos..].chars().next(); - if matches!(next, Some('?' | '*' | '+' | '{')) { - break; - } - } + Atom::Char(c) => { + prev_atom_lit_len = lit.len(); + prev_atom_pos = start; lit.push(c); - self.advance(); } - _ => break, + Atom::WordEnd if lit.is_empty() => { + return Ok(Regex::WordEnd); + } + Atom::Class(cs) if lit.is_empty() => { + return Ok(Regex::CharClass(cs)); + } + _ => { + self.pos = start; + break; + } } } @@ -336,32 +319,6 @@ impl<'a> RegexParser<'a> { Ok(Regex::Literal(lit, self.case_insensitive)) } - /// \w, \d, etc. - fn parse_escape_as_regex(&mut self) -> Result { - match self.advance() { - Some('w') => Ok(Regex::CharClass(ASCII_WORD_CHARSET)), - Some('W') => { - let mut cs = ASCII_WORD_CHARSET; - cs.invert(); - Ok(Regex::CharClass(cs)) - } - Some('d') => Ok(Regex::CharClass(ASCII_DIGIT_CHARSET)), - Some('D') => { - let mut cs = ASCII_DIGIT_CHARSET; - cs.invert(); - Ok(Regex::CharClass(cs)) - } - Some('s') => Ok(Regex::CharClass(ASCII_WHITESPACE_CHARSET)), - Some('S') => { - let mut cs = ASCII_WHITESPACE_CHARSET; - cs.invert(); - Ok(Regex::CharClass(cs)) - } - Some(c) => Err(format!("unknown escape sequence '\\{}'", c)), - None => Err("unexpected end of pattern after backslash".to_string()), - } - } - /// (foo), (?:foo), (?i:foo) fn parse_group(&mut self) -> Result { self.expect('(')?; @@ -400,6 +357,19 @@ impl<'a> RegexParser<'a> { /// [a-z], [^a-z], etc. fn parse_char_class(&mut self) -> Result { + fn unexpected_end() -> Result { + Err("unexpected end of pattern in character class".to_string()) + } + fn unexpected_unicode(c: char) -> Result { + Err(format!("non-ASCII character '{c:?}' not supported in character class")) + } + fn unexpected_class() -> Result { + Err("cannot use character class in character class range".to_string()) + } + fn invalid_range(start: u8, end: u8) -> Result { + Err(format!("invalid character range {:?}-{:?}", start as char, end as char)) + } + self.expect('[')?; let negated = if self.peek() == Some('^') { @@ -419,47 +389,52 @@ impl<'a> RegexParser<'a> { self.advance(); } - while let Some(c) = self.peek() { - if c == ']' { - self.advance(); - break; - } - - if c == '\\' { - self.advance(); - let escaped = self.parse_escape_char()?; - match escaped { - EscapedChar::Char(b) => charset.set(b, true), - EscapedChar::Class(cs) => charset.merge(&cs), + loop { + match self.parse_atom()? { + Atom::Empty => return unexpected_end(), + Atom::Class(cs) => { + charset.merge(&cs); } - } else { - let start = c as u8; - self.advance(); + Atom::WordEnd => { + charset.set(b'>', true); + } + Atom::Meta(']') => break, + Atom::Meta(c) | Atom::Char(c) => { + if !c.is_ascii() { + return unexpected_unicode(c); + } - // Check for range - if self.peek() == Some('-') && !self.input[self.pos + 1..].starts_with(']') { - self.advance(); // consume - - let end = match self.peek() { - Some('\\') => { - self.advance(); - match self.parse_escape_char()? { - EscapedChar::Char(b) => b, - EscapedChar::Class(_) => { - return Err("cannot use character class in range".to_string()); + let start = c as u8; + let mut end = start; + + // Check for ranges, e.g. [a-z]. + // We exclude patterns like [a-], because this implicitly sets 'a'..='a' in this iteration, + // and then '-'..='-' in the next iteration, which is the exact behavior we need. + if let rest = self.rest() + && rest.starts_with("-") + && !rest.starts_with("-]") + { + self.advance(); // consume - + + match self.parse_atom()? { + Atom::Empty => return unexpected_end(), + Atom::Class(_) => return unexpected_class(), + Atom::WordEnd => { + end = b'>'; + } + Atom::Meta(c) | Atom::Char(c) => { + if !c.is_ascii() { + return unexpected_unicode(c); } + end = c as u8; } } - Some(c) => { - self.advance(); - c as u8 - } - None => { - return Err("unexpected end of pattern in character class".to_string()); - } - }; + } + + if start > end { + return invalid_range(start, end); + } charset.set_range(start..=end, true); - } else { - charset.set(start, true); } } } @@ -471,42 +446,50 @@ impl<'a> RegexParser<'a> { Ok(Regex::CharClass(charset)) } - fn parse_escape_char(&mut self) -> Result { - match self.advance() { - Some('w') => Ok(EscapedChar::Class(ASCII_WORD_CHARSET)), - Some('W') => { - let mut cs = ASCII_WORD_CHARSET; - cs.invert(); - Ok(EscapedChar::Class(cs)) - } - Some('d') => Ok(EscapedChar::Class(ASCII_DIGIT_CHARSET)), - Some('D') => { - let mut cs = ASCII_DIGIT_CHARSET; - cs.invert(); - Ok(EscapedChar::Class(cs)) + fn parse_atom(&mut self) -> Result { + let Some(c) = self.advance() else { + return Ok(Atom::Empty); + }; + + match c { + '(' | ')' | '[' | ']' | '{' | '}' | '|' | '?' | '*' | '+' | '.' | '^' | '$' => { + Ok(Atom::Meta(c)) } - Some('s') => Ok(EscapedChar::Class(ASCII_WHITESPACE_CHARSET)), - Some('S') => { - let mut cs = ASCII_WHITESPACE_CHARSET; - cs.invert(); - Ok(EscapedChar::Class(cs)) + '\\' => { + let Some(ce) = self.advance() else { + return Err("unexpected end of pattern after backslash".to_string()); + }; + + match ce { + '>' => Ok(Atom::WordEnd), + 'w' => Ok(Atom::Class(ASCII_WORD_CHARSET)), + 'W' => { + let mut cs = ASCII_WORD_CHARSET; + cs.invert(); + Ok(Atom::Class(cs)) + } + 'd' => Ok(Atom::Class(ASCII_DIGIT_CHARSET)), + 'D' => { + let mut cs = ASCII_DIGIT_CHARSET; + cs.invert(); + Ok(Atom::Class(cs)) + } + 's' => Ok(Atom::Class(ASCII_WHITESPACE_CHARSET)), + 'S' => { + let mut cs = ASCII_WHITESPACE_CHARSET; + cs.invert(); + Ok(Atom::Class(cs)) + } + 't' => Ok(Atom::Char('\t')), + c if !c.is_ascii_alphanumeric() => Ok(Atom::Char(c)), + c => Err(format!("unknown escape sequence '\\{c}'")), + } } - Some(c) if !c.is_ascii_alphanumeric() => Ok(EscapedChar::Char(c as u8)), - Some(c) => Err(format!("unknown escape sequence '\\{}'", c)), - None => Err("unexpected end of pattern after backslash".to_string()), + _ => Ok(Atom::Char(c)), } } } -enum EscapedChar { - Char(u8), - Class(Charset), -} - -fn is_meta_char(c: char) -> bool { - matches!(c, '(' | ')' | '[' | ']' | '{' | '}' | '|' | '?' | '*' | '+' | '.' | '^' | '$') -} - struct CodeGen<'a, 'c> { compiler: &'c mut Compiler<'a>, captures: CaptureList<'a>, From 2b8ec8c93056465e08930b29037258ab9b6b8364 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Thu, 26 Mar 2026 15:26:41 +0100 Subject: [PATCH 04/11] Detect indented code blocks in Markdown --- crates/lsh/definitions/markdown.lsh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crates/lsh/definitions/markdown.lsh b/crates/lsh/definitions/markdown.lsh index f5444d32f94d..3fcc7721a157 100644 --- a/crates/lsh/definitions/markdown.lsh +++ b/crates/lsh/definitions/markdown.lsh @@ -1,6 +1,12 @@ #[display_name = "Markdown"] #[path = "**/*.md"] pub fn markdown() { + // Any lines that start with a tab or 4+ spaces are code blocks. + if /(?:\t| ).*/ { + yield other; + return; + } + // Gobble any leading whitespace on the line. if /\s+/ { yield other; From 76b58eb944521bdffda9de9c26df70ced781e44f Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Fri, 27 Mar 2026 16:22:14 +0100 Subject: [PATCH 05/11] Remove unneeded rust-toolchain.toml file --- rust-toolchain.toml | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 rust-toolchain.toml diff --git a/rust-toolchain.toml b/rust-toolchain.toml deleted file mode 100644 index 5d56faf9ae08..000000000000 --- a/rust-toolchain.toml +++ /dev/null @@ -1,2 +0,0 @@ -[toolchain] -channel = "nightly" From c51b006e49ff2c13777d57fbe8dfb4267e8d30b0 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Fri, 27 Mar 2026 16:22:39 +0100 Subject: [PATCH 06/11] Fix fast skip loop calculation --- crates/lsh/src/compiler/mod.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/crates/lsh/src/compiler/mod.rs b/crates/lsh/src/compiler/mod.rs index bbeb2814365b..d6b372387358 100644 --- a/crates/lsh/src/compiler/mod.rs +++ b/crates/lsh/src/compiler/mod.rs @@ -21,6 +21,7 @@ use std::path::Path; use stdext::arena::Arena; use stdext::collections::BString; +use stdext::opt_ptr_eq; pub use self::charset::{Charset, SerializedCharset}; use self::frontend::*; @@ -163,7 +164,8 @@ impl<'a> Compiler<'a> { break; }; - if let IRI::If { condition, then } = node.borrow().instr { + let node = node.borrow(); + if let IRI::If { condition, then } = node.instr { // For the purpose of computing fast-skips the contents of if conditions are irrelevant, // so skip the subtree. This is actually quite important. This this as an example: // loop { @@ -176,7 +178,19 @@ impl<'a> Compiler<'a> { // } // The inverted charset of the inner /b/ includes "a". If we merge that into the outer // loop's charset we get one that covers all characters, making fast-skips impossible. - iter.skip_node(then); + // --> Skip the "then" subtree. + // + // HOWEVER, imagine a condition like this: + // if /a?b/ {} + // This compiles to something like: + // if "a" + // .then -> if "b" {} + // .else -> if "b" {} (aka: .next) + // In other words, "then" and "next" point to the same thing. + // --> Only skip "then" if it's not the same as "next". + if !opt_ptr_eq(Some(then), node.next) { + iter.skip_node(then); + } match condition { Condition::Cmp { .. } => {} From 81a2b4d63351bd46e5661e61ca1638f49a5f6771 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Fri, 27 Mar 2026 16:22:56 +0100 Subject: [PATCH 07/11] Optimize any-char (.) expressions --- crates/lsh/src/compiler/regex.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/lsh/src/compiler/regex.rs b/crates/lsh/src/compiler/regex.rs index e4d3c4696253..f32c475c4600 100644 --- a/crates/lsh/src/compiler/regex.rs +++ b/crates/lsh/src/compiler/regex.rs @@ -547,12 +547,10 @@ impl<'a, 'c> CodeGen<'a, 'c> { } Regex::Dot => { - let cs = Charset::yes(); - let cs = self.compiler.intern_charset(&cs); - let condition = Condition::Charset { cs, min: 1, max: 1 }; - let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_match }); - if_node.borrow_mut().next = Some(on_fail); - Ok(if_node) + let dst = self.compiler.get_reg(Register::InputOffset); + let node = self.compiler.alloc_iri(IRI::AddImm { dst, imm: 1 }); + node.borrow_mut().next = Some(on_match); + Ok(node) } Regex::EndOfLine => { From ca8e7b260f29a57645f4649de827f2fd32ed33c8 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Fri, 27 Mar 2026 18:25:16 +0100 Subject: [PATCH 08/11] Optimize optional a? expressions --- crates/lsh/src/compiler/regex.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/crates/lsh/src/compiler/regex.rs b/crates/lsh/src/compiler/regex.rs index f32c475c4600..e0041ea5928f 100644 --- a/crates/lsh/src/compiler/regex.rs +++ b/crates/lsh/src/compiler/regex.rs @@ -660,10 +660,18 @@ impl<'a, 'c> CodeGen<'a, 'c> { return self.emit_charset(cs, min, max, on_match, on_fail); } - // Single-char literal like `#+`: convert to charset for efficient handling. + // Single-char literal like `#+` if let Regex::Literal(ref s, case_insensitive) = *inner && s.len() == 1 { + // Optional single char: `a?` + // It can be trivially translated to a Prefix/PrefixInsensitive check + // where even on_fail is a success and is thus connected to on_match. + if min == 0 && max == 1 { + return self.emit(inner, on_match, on_match); + } + + // Otherwise, we must translate to a Charset match. let b = s.as_bytes()[0]; let mut cs = Charset::no(); if case_insensitive { From 8fe1871fba670a68efd0aac06c7a162daf1dee2b Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Fri, 27 Mar 2026 18:26:21 +0100 Subject: [PATCH 09/11] Optimize dual charsets [-+] --- crates/lsh/src/compiler/charset.rs | 11 ++++ crates/lsh/src/compiler/regex.rs | 81 +++++++++++++++++++++--------- 2 files changed, 67 insertions(+), 25 deletions(-) diff --git a/crates/lsh/src/compiler/charset.rs b/crates/lsh/src/compiler/charset.rs index ca772904b8b9..e7e301e3253d 100644 --- a/crates/lsh/src/compiler/charset.rs +++ b/crates/lsh/src/compiler/charset.rs @@ -35,6 +35,17 @@ impl Charset { (self.bits[hi] & (1 << lo)) != 0 } + pub fn get_and_reset_lowest(&mut self) -> Option { + for (hi, bits) in self.bits.iter_mut().enumerate() { + if *bits != 0 { + let lo = bits.trailing_zeros() as usize; + *bits &= !(1 << lo); + return Some((hi * WORD_BITS + lo) as u8); + } + } + None + } + pub fn covers_none(&self) -> bool { self.bits.iter().all(|&b| b == usize::MIN) } diff --git a/crates/lsh/src/compiler/regex.rs b/crates/lsh/src/compiler/regex.rs index e0041ea5928f..e2c6cd11f77c 100644 --- a/crates/lsh/src/compiler/regex.rs +++ b/crates/lsh/src/compiler/regex.rs @@ -43,6 +43,8 @@ //! - The `parse()` function wires its generated IR into the provided destination nodes. //! Don't pass nodes that are already part of the IR graph. +use std::slice; + use stdext::collections::BVec; use super::*; @@ -524,27 +526,10 @@ impl<'a, 'c> CodeGen<'a, 'c> { Regex::Empty => Ok(on_match), Regex::Literal(s, case_insensitive) => { - if s.is_empty() { - return Ok(on_match); - } - let s = self.compiler.intern_string(s); - let condition = if *case_insensitive { - Condition::PrefixInsensitive(s) - } else { - Condition::Prefix(s) - }; - let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_match }); - if_node.borrow_mut().next = Some(on_fail); - Ok(if_node) + self.emit_literal(s, *case_insensitive, on_match, on_fail) } - Regex::CharClass(cs) => { - let cs = self.compiler.intern_charset(cs); - let condition = Condition::Charset { cs, min: 1, max: u32::MAX }; - let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_match }); - if_node.borrow_mut().next = Some(on_fail); - Ok(if_node) - } + Regex::CharClass(cs) => self.emit_charset(cs, 1, 1, on_match, on_fail), Regex::Dot => { let dst = self.compiler.get_reg(Register::InputOffset); @@ -564,11 +549,7 @@ impl<'a, 'c> CodeGen<'a, 'c> { Regex::WordEnd => { // \> is a zero-width assertion: succeeds if NOT followed by a word char. // We invert the logic: check for word char, swap success/failure branches. - let cs = self.compiler.intern_charset(&ASCII_WORD_CHARSET); - let condition = Condition::Charset { cs, min: 1, max: 1 }; - let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_fail }); - if_node.borrow_mut().next = Some(on_match); - Ok(if_node) + self.emit_charset(&ASCII_WORD_CHARSET, 1, 1, on_fail, on_match) } Regex::Concat(parts) => { @@ -706,6 +687,24 @@ impl<'a, 'c> CodeGen<'a, 'c> { Ok(current) } + fn emit_literal( + &mut self, + s: &str, + case_insensitive: bool, + on_match: IRCell<'a>, + on_fail: IRCell<'a>, + ) -> Result, String> { + if s.is_empty() { + return Ok(on_match); + } + let s = self.compiler.intern_string(s); + let condition = + if case_insensitive { Condition::PrefixInsensitive(s) } else { Condition::Prefix(s) }; + let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_match }); + if_node.borrow_mut().next = Some(on_fail); + Ok(if_node) + } + fn emit_charset( &mut self, cs: &Charset, @@ -714,12 +713,44 @@ impl<'a, 'c> CodeGen<'a, 'c> { on_match: IRCell<'a>, on_fail: IRCell<'a>, ) -> Result, String> { + let mut next = if min == 0 { on_match } else { on_fail }; + + // If the expression is of form [a], [ab], [aA], or [aAbB] it is + // worth translating it to a Prefix/PrefixInsensitive check. + // The [a] and [aA] cases are an obvious improvement, but even the other + // two cases are worth it due to the shorter instruction encoding. + if max == 1 { + let mut cs = cs.clone(); + let mut chars = [(0u8, false); 2]; + let mut count = 0; + + for slot in &mut chars { + let Some(mut idx) = cs.get_and_reset_lowest() else { break }; + let case_insensitive = idx.is_ascii_uppercase() && cs.get(idx.to_ascii_lowercase()); + if case_insensitive { + idx = idx.to_ascii_lowercase(); + cs.set(idx, false); + } + *slot = (idx, case_insensitive); + count += 1; + } + + if count > 0 && cs.covers_none() { + for &(ch, insensitive) in chars[..count].iter().rev() { + let s = unsafe { str::from_utf8_unchecked(slice::from_ref(&ch)) }; + let node = self.emit_literal(s, insensitive, on_match, next)?; + next = node; + } + return Ok(next); + } + } + let cs = self.compiler.intern_charset(cs); let condition = Condition::Charset { cs, min, max }; let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_match }); // min=0 implies that it cannot fail. Remove `on_fail` to allow for later optimizations. - if_node.borrow_mut().next = Some(if min == 0 { on_match } else { on_fail }); + if_node.borrow_mut().next = Some(next); Ok(if_node) } From 07130a7e5e0fed489ccd9d1ce1b15a25602c1311 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Fri, 27 Mar 2026 18:32:49 +0100 Subject: [PATCH 10/11] Inline call/ret instruction jumps --- crates/lsh/src/compiler/backend.rs | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/crates/lsh/src/compiler/backend.rs b/crates/lsh/src/compiler/backend.rs index cbe6f84c51cb..8a5016147c98 100644 --- a/crates/lsh/src/compiler/backend.rs +++ b/crates/lsh/src/compiler/backend.rs @@ -352,11 +352,29 @@ impl<'a> Backend<'a> { // If the next instruction was already serialized (e.g. this is some form of loop), // simply jump to the already serialized code. We're done here. Nothing new will come after this. + // + // If the destination is call/ret instruction we can just inline it. + // Otherwise, it'd be like jumping to a jump. + // + // TODO: If you think about it, this should kinda go into optimizer.rs, because it could + // do optimizations across entire instruction sequences (= it could do inlining!). + // But optimizer.rs doesn't have a linearized view of the assembly so it can't do this. if ir.offset != usize::MAX { - self.push_instruction(MovImm { - dst: Register::ProgramCounter, - imm: ir.offset as u32, - }); + match ir.instr { + IRI::Call { name } => { + let tgt = self.dst_by_name(name) as u32; + self.push_instruction(Call { tgt }); + } + IRI::Return => { + self.push_instruction(Return); + } + _ => { + self.push_instruction(MovImm { + dst: Register::ProgramCounter, + imm: ir.offset as u32, + }); + } + } break; } } From 8efe99472bcc93e25e3aacc093861b1652dfc3cc Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Fri, 27 Mar 2026 18:38:45 +0100 Subject: [PATCH 11/11] Silence, wench --- crates/edit/src/bin/edit/draw_menubar.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/edit/src/bin/edit/draw_menubar.rs b/crates/edit/src/bin/edit/draw_menubar.rs index 5d1acc2114a7..1ab702b64644 100644 --- a/crates/edit/src/bin/edit/draw_menubar.rs +++ b/crates/edit/src/bin/edit/draw_menubar.rs @@ -53,6 +53,7 @@ fn draw_menu_file(ctx: &mut Context, state: &mut State) { state.wants_file_picker = StateFilePicker::SaveAs; } } + #[allow(irrefutable_let_patterns)] if let path = Settings::borrow().path.as_path() && !path.as_os_str().is_empty() && ctx.menubar_menu_button(loc(LocId::FilePreferences), 'P', vk::NULL)