Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
525 changes: 514 additions & 11 deletions Cargo.lock

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions crates/code2prompt-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,16 @@ readme = "../../README.md"

[features]
default = []
# Entity-level code map via sem-core (functions/classes with line ranges and
# signatures). Optional because sem-core pulls in tree-sitter grammars for many
# languages; users who don't need the code map pay no build cost.
entity-map = ["dep:sem-core"]

[dependencies]
# Optional: only compiled with the `entity-map` feature. sem-core is offline and
# carries no telemetry (that lives in the sem CLI, not the library), so this keeps
# code2prompt fully air-gapped.
sem-core = { version = "0.13", optional = true }
anyhow = { workspace = true }
bracoxide = { workspace = true }
colored = { workspace = true }
Expand Down
8 changes: 8 additions & 0 deletions crates/code2prompt-core/src/configuration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,14 @@ pub struct Code2PromptConfig {
/// If true, symbolic links will be followed during traversal.
pub follow_symlinks: bool,

/// If true, extract an entity-level code map (functions, classes, ...) for
/// each file via sem-core, exposed to templates as `FileEntry.entities` and a
/// top-level `code_map`. Requires the `entity-map` build feature; without it
/// this flag has no effect.
///
/// Default: `false`
pub entity_map: bool,

/// Include hidden files and directories in processing.
///
/// Default: `false`
Expand Down
11 changes: 11 additions & 0 deletions crates/code2prompt-core/src/default_template_md.hbs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,17 @@ Source Tree:
{{ source_tree }}
```

{{#if code_map}}
Code Map:

{{#each code_map}}
`{{path}}`:
{{#each entities}}
- {{kind}} {{name}}{{#if signature}} `{{signature}}`{{/if}} (lines {{start_line}}-{{end_line}})
{{/each}}

{{/each}}
{{/if}}
{{#each files}}
{{#if code}}
`{{path}}`:
Expand Down
128 changes: 128 additions & 0 deletions crates/code2prompt-core/src/entity_map.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
//! Entity-level code map via [sem-core](https://github.com/Ataraxy-Labs/sem).
//!
//! When the `entity-map` feature is enabled, code2prompt extracts the structural
//! entities (functions, classes, methods, ...) from each source file using
//! sem-core's tree-sitter parsers. The result is exposed to templates both
//! per-file (`FileEntry.entities`) and as a top-level `code_map` aggregate, so a
//! prompt can include a compact outline of the codebase instead of, or alongside,
//! full file contents.
//!
//! sem-core is offline and emits no telemetry, so enabling this does not change
//! code2prompt's privacy posture.

use serde::{Deserialize, Serialize};

/// A single structural entity (function, class, method, ...) within a file.
///
/// This is a deliberately small projection of sem-core's internal entity type:
/// it carries only what a prompt template needs (name, kind, line range,
/// signature, parent), not source bodies or content hashes.
///
/// Values are produced by `extract_entities`. The two optional fields are left
/// out of the serialized form entirely when they are `None`, so a template can
/// guard on them with a plain `{{#if signature}}`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct EntitySummary {
/// Entity name, e.g. `process_single_file`.
pub name: String,
/// Entity kind as reported by sem-core, e.g. `function`, `class`, `method`.
/// Stored as the raw string sem-core returns; it is not normalized here.
pub kind: String,
/// 1-based first line of the entity.
pub start_line: usize,
/// 1-based last line of the entity.
pub end_line: usize,
/// First line of the entity's source (its signature/declaration), trimmed.
/// `None` when that first line is empty after trimming.
#[serde(skip_serializing_if = "Option::is_none")]
pub signature: Option<String>,
/// Name of the enclosing entity (e.g. the class a method belongs to), if any.
/// Resolved only against entities extracted from the same file, so it is
/// `None` when there is no parent or the parent id cannot be found there.
#[serde(skip_serializing_if = "Option::is_none")]
pub parent: Option<String>,
}

/// A file paired with its entity outline, used for the top-level `code_map`
/// template variable (an aggregate view alongside the per-file `entities`).
///
/// Derives `Serialize` but not `Deserialize`: it only needs to be written into
/// the template/JSON context.
#[derive(Debug, Clone, Serialize)]
pub struct FileCodeMap {
/// Path of the file, copied from the corresponding `FileEntry.path`.
pub path: String,
/// Entities extracted from that file. Files with no entities are left out
/// of the code map, so this list is non-empty for entries built by the session.
pub entities: Vec<EntitySummary>,
}

/// Extract the entity outline for one file's contents.
///
/// `file_path` is used by sem-core to pick the right language parser (by
/// extension). Returns an empty vector for files in languages sem-core does not
/// parse, so it is safe to call on every file.
///
/// Each returned [`EntitySummary`] carries:
/// - `signature`: the first line of the entity's source, trimmed, or `None`
///   when that line is blank;
/// - `parent`: the name of the entity whose id matches `parent_id`, looked up
///   among the entities of this same file only.
#[cfg(feature = "entity-map")]
pub fn extract_entities(file_path: &str, content: &str) -> Vec<EntitySummary> {
    use sem_core::parser::plugins::create_default_registry;
    use sem_core::parser::registry::ParserRegistry;
    use std::collections::HashMap;

    // One registry per worker thread: building it registers every language
    // plugin, so we amortize that across files rather than paying it per file,
    // while staying thread-safe inside code2prompt's rayon file pipeline.
    // The registry is only ever read after construction, so it is stored
    // directly; no `RefCell` (and no runtime borrow check) is needed.
    // NOTE: `ParserRegistry::new()` is empty; `create_default_registry()` is the
    // populated one the sem CLI uses.
    thread_local! {
        static REGISTRY: ParserRegistry = create_default_registry();
    }

    REGISTRY.with(|registry| {
        let entities = registry.extract_entities(file_path, content);

        // Resolve parent_id -> parent name so methods can show their class.
        let name_by_id: HashMap<&str, &str> = entities
            .iter()
            .map(|e| (e.id.as_str(), e.name.as_str()))
            .collect();

        entities
            .iter()
            .map(|e| {
                // Trim and filter on the borrowed line first so a blank first
                // line never costs an allocation.
                let signature = e
                    .content
                    .lines()
                    .next()
                    .map(str::trim)
                    .filter(|line| !line.is_empty())
                    .map(str::to_owned);
                let parent = e
                    .parent_id
                    .as_deref()
                    .and_then(|pid| name_by_id.get(pid))
                    .map(|name| (*name).to_owned());
                EntitySummary {
                    name: e.name.clone(),
                    kind: e.entity_type.clone(),
                    start_line: e.start_line,
                    end_line: e.end_line,
                    signature,
                    parent,
                }
            })
            .collect()
    })
}

/// Feature-off stand-in for entity extraction.
///
/// Compiled only when the `entity-map` feature is disabled. It ignores both
/// arguments and always yields an empty list, so callers need no `cfg` of their
/// own and the crate builds without the sem-core dependency.
#[cfg(not(feature = "entity-map"))]
pub fn extract_entities(_file_path: &str, _content: &str) -> Vec<EntitySummary> {
    vec![]
}

// Tests run only with the `entity-map` feature, since they need the real
// sem-core parsers rather than the feature-off stand-in.
#[cfg(all(test, feature = "entity-map"))]
mod tests {
    use super::*;

    /// True when `entities` contains an entry named `name`.
    fn has_entity(entities: &[EntitySummary], name: &str) -> bool {
        entities.iter().any(|e| e.name == name)
    }

    #[test]
    fn extracts_rust_entities() {
        let src = "pub struct Cache { size: usize }\n\nimpl Cache {\n pub fn new(size: usize) -> Self { Cache { size } }\n}\n\nfn helper(x: i32) -> i32 { x * 2 }\n";
        let got = extract_entities("util.rs", src);
        assert!(!got.is_empty(), "expected entities, got none");
        assert!(has_entity(&got, "helper"), "missing `helper` in {got:?}");
    }

    #[test]
    fn extracts_python_entities() {
        // Python is indentation-sensitive: the method body must be indented
        // deeper than its `def`, otherwise the fixture is not valid Python and
        // the test would only exercise the parser's error recovery.
        let src = "class Calculator:\n    def add(self, a, b):\n        return a + b\n\ndef main():\n    pass\n";
        let got = extract_entities("math.py", src);
        assert!(!got.is_empty(), "expected entities, got none");
        // Pin concrete names, mirroring the Rust test, instead of accepting any
        // non-empty result.
        assert!(has_entity(&got, "Calculator"), "missing `Calculator` in {got:?}");
        assert!(has_entity(&got, "main"), "missing `main` in {got:?}");
    }
}
1 change: 1 addition & 0 deletions crates/code2prompt-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
pub mod analysis;
pub mod builtin_templates;
pub mod configuration;
pub mod entity_map;
pub mod file_processor;
pub mod filter;
pub mod git;
Expand Down
15 changes: 15 additions & 0 deletions crates/code2prompt-core/src/path.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! This module contains the functions for traversing the directory and processing the files.
use crate::configuration::Code2PromptConfig;
use crate::entity_map::{EntitySummary, extract_entities};
use crate::file_processor;
use crate::filter::{build_globset, should_include_file};
use crate::sort::{FileSortMethod, sort_files, sort_tree};
Expand Down Expand Up @@ -41,6 +42,11 @@ pub struct FileEntry {
pub metadata: EntryMetadata,
#[serde(skip_serializing_if = "Option::is_none")]
pub mod_time: Option<u64>,
/// Structural entities (functions, classes, ...) extracted from this file.
/// Empty unless the `entity-map` feature is enabled and `config.entity_map`
/// is set. Skipped from serialized output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub entities: Vec<EntitySummary>,
}

/// Represents a file that needs to be processed
Expand Down Expand Up @@ -283,13 +289,22 @@ fn process_single_file(file_info: &FileToProcess, config: &Code2PromptConfig) ->

debug!(target: "included_files", "Included file: {}", file_path);

// Extract the entity outline from the unwrapped source. Cheap no-op unless
// the `entity-map` feature is on and the user enabled it.
let entities = if config.entity_map {
extract_entities(&file_path, &code)
} else {
Vec::new()
};

Some(FileEntry {
path: file_path,
extension: extension.to_string(),
code: code_block,
token_count,
metadata: EntryMetadata::from(metadata),
mod_time,
entities,
})
}

Expand Down
30 changes: 30 additions & 0 deletions crates/code2prompt-core/src/session.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use std::sync::Arc;

use crate::analysis::CodebaseAnalysis;
use crate::configuration::Code2PromptConfig;
use crate::entity_map::FileCodeMap;
use crate::git::{get_git_diff, get_git_diff_between_branches, get_git_log};
use crate::path::{FileEntry, display_name, traverse_directory, wrap_code_block};
use crate::selection::SelectionEngine;
Expand Down Expand Up @@ -91,6 +92,12 @@ pub struct TemplateContext<'a> {
#[serde(skip_serializing_if = "Option::is_none")]
pub git_log_branch: &'a Option<String>,

/// Top-level entity-level code map: one entry per file that has extracted
/// entities. Present only when the `entity-map` feature is enabled and the
/// user opted in. Per-file entities are also available via `files[].entities`.
#[serde(skip_serializing_if = "Option::is_none")]
pub code_map: Option<Vec<FileCodeMap>>,

#[serde(flatten)]
pub user_variables: &'a HashMap<String, String>,

Expand Down Expand Up @@ -286,6 +293,23 @@ impl Code2PromptSession {
Ok(())
}

/// Assembles the aggregate `code_map` value from the files currently loaded
/// in the session.
///
/// Only files with at least one extracted entity contribute an entry. The
/// result is `None` when no files are loaded or when none of them has any
/// entities (e.g. entity extraction was disabled), so the template variable is
/// absent rather than an empty list.
fn build_code_map(&self) -> Option<Vec<FileCodeMap>> {
    let loaded = self.data.files.as_deref()?;

    let mut outline: Vec<FileCodeMap> = Vec::new();
    for entry in loaded {
        // Files without entities are left out of the aggregate entirely.
        if entry.entities.is_empty() {
            continue;
        }
        outline.push(FileCodeMap {
            path: entry.path.clone(),
            entities: entry.entities.clone(),
        });
    }

    if outline.is_empty() {
        None
    } else {
        Some(outline)
    }
}

/// Constructs a zero-copy template context for rendering.
pub fn build_template_data(&self) -> TemplateContext<'_> {
TemplateContext {
Expand All @@ -295,6 +319,7 @@ impl Code2PromptSession {
git_diff: &self.data.git_diff,
git_diff_branch: &self.data.git_diff_branch,
git_log_branch: &self.data.git_log_branch,
code_map: self.build_code_map(),
user_variables: &self.config.user_variables,
no_codeblock: self.config.no_codeblock,
}
Expand Down Expand Up @@ -386,6 +411,7 @@ impl Code2PromptSession {
"token_count": token_count,
"model_info": model_info,
"files": files.clone(),
"code_map": self.build_code_map(),
});
serde_json::to_string_pretty(&json_data)?
}
Expand Down Expand Up @@ -461,6 +487,9 @@ impl Code2PromptSession {
token_count: 0, // Not used in skeleton
metadata: file.metadata,
mod_time: file.mod_time,
// Keep entities so the code map is counted in the
// structural token total.
entities: file.entities.clone(),
}
})
.collect()
Expand All @@ -474,6 +503,7 @@ impl Code2PromptSession {
git_diff: &self.data.git_diff,
git_diff_branch: &self.data.git_diff_branch,
git_log_branch: &self.data.git_log_branch,
code_map: self.build_code_map(),
user_variables: &self.config.user_variables,
no_codeblock: self.config.no_codeblock,
};
Expand Down
1 change: 1 addition & 0 deletions crates/code2prompt-core/tests/analysis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ mod tests {
is_symlink: false,
},
mod_time: None,
entities: vec![],
}
}

Expand Down
Loading