
Commit 9f37463

Commit message: WIP

1 parent: b21c738


41 files changed: +809, -12742 lines

misc/scripts/models-as-data/bulk_generate_mad.py

Lines changed: 50 additions & 20 deletions
```diff
@@ -44,6 +44,15 @@ def missing_module(module_name: str) -> None:
 build_dir = pathlib.Path(gitroot, "mad-generation-build")
 
 
+def database_dir_for_project(name: str) -> pathlib.Path:
+    return build_dir / f"{name}-db"
+
+
+def database_for_project_exists(name: str) -> bool:
+    path = database_dir_for_project(name)
+    return path.exists()
+
+
 # A project to generate models for
 Project = TypedDict(
     "Project",
```
```diff
@@ -175,7 +184,7 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
 
 def build_database(
     language: str, extractor_options, project: Project, project_dir: str
-) -> str | None:
+) -> bool:
     """
     Build a CodeQL database for a project.
 
@@ -186,12 +195,12 @@ def build_database(
         project_dir: Path to the CodeQL database.
 
     Returns:
-        The path to the created database directory.
+        True if the build was successful, False otherwise.
     """
     name = project["name"]
 
     # Create database directory path
-    database_dir = build_dir / f"{name}-db"
+    database_dir = database_dir_for_project(name)
 
     # Only build the database if it doesn't already exist
     if not database_dir.exists():
@@ -214,13 +223,13 @@ def build_database(
             print(f"Successfully created database at {database_dir}")
         except subprocess.CalledProcessError as e:
             print(f"Failed to create database for {name}: {e}")
-            return None
+            return False
     else:
         print(
             f"Skipping database creation for {name} as it already exists at {database_dir}"
         )
 
-    return database_dir
+    return True
 
 
 def generate_models(config, args, project: Project, database_dir: str) -> None:
@@ -251,7 +260,7 @@ def generate_models(config, args, project: Project, database_dir: str) -> None:
 
 def build_databases_from_projects(
     language: str, extractor_options, projects: List[Project]
-) -> List[tuple[Project, str | None]]:
+) -> List[tuple[Project, bool]]:
     """
     Build databases for all projects in parallel.
 
@@ -261,7 +270,7 @@ def build_databases_from_projects(
         projects: List of projects to build databases for.
 
     Returns:
-        List of (project_name, database_dir) pairs, where database_dir is None if the build failed.
+        List of (project_name, success) pairs, where success is False if the build failed.
     """
     # Clone projects in parallel
     print("=== Cloning projects ===")
```
```diff
@@ -332,20 +341,22 @@ def download_dca_databases(
     language: str,
     experiment_names: list[str],
     pat: str,
+    reuse_databases: bool,
     projects: List[Project],
-) -> List[tuple[Project, str | None]]:
+) -> List[tuple[Project, bool]]:
     """
     Download databases from a DCA experiment.
     Args:
         experiment_names: The names of the DCA experiments to download databases from.
         pat: Personal Access Token for GitHub API authentication.
         projects: List of projects to download databases for.
     Returns:
-        List of (project_name, database_dir) pairs, where database_dir is None if the download failed.
+        List of (project_name, success) pairs, where success is False if the download failed.
     """
     print("\n=== Finding projects ===")
     project_map = {project["name"]: project for project in projects}
-    analyzed_databases = {n: None for n in project_map}
+
+    analyzed_databases = {}
     for experiment_name in experiment_names:
         response = get_json_from_github(
             f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
@@ -358,11 +369,11 @@ def download_dca_databases(
             artifact_name = analyzed_database["artifact_name"]
             pretty_name = pretty_name_from_artifact_name(artifact_name)
 
-            if not pretty_name in analyzed_databases:
+            if not pretty_name in project_map:
                 print(f"Skipping {pretty_name} as it is not in the list of projects")
                 continue
 
-            if analyzed_databases[pretty_name] is not None:
+            if pretty_name in analyzed_databases:
                 print(
                     f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
                 )
```
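
The hunk above replaces a map pre-seeded with `{name: None}` by one that only ever contains located databases, so plain membership now means "already found". A self-contained sketch of that membership-based dedup (hypothetical data; the assignment that records each database is outside the shown hunk, so the replace-previous behavior here is an assumption matching the "Skipping previous database" message):

```python
# Membership-based dedup: only known projects are kept, and finding a
# project again reports the previously recorded artifact.
project_map = {"tokio": {}, "serde": {}}
artifacts = ["tokio__run1", "unknown__run1", "tokio__run2"]

analyzed_databases: dict[str, dict] = {}
for artifact_name in artifacts:
    pretty_name = artifact_name.split("__")[0]  # stand-in for pretty_name_from_artifact_name
    if pretty_name not in project_map:
        print(f"Skipping {pretty_name} as it is not in the list of projects")
        continue
    if pretty_name in analyzed_databases:
        print(f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']}")
    analyzed_databases[pretty_name] = {"artifact_name": artifact_name}
```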
```diff
@@ -376,8 +387,9 @@
         )
         sys.exit(1)
 
-    def download_and_decompress(analyzed_database: dict) -> str:
+    def download_and_decompress(analyzed_database: dict) -> bool:
         artifact_name = analyzed_database["artifact_name"]
+        pretty_name = pretty_name_from_artifact_name(artifact_name)
         repository = analyzed_database["repository"]
         run_id = analyzed_database["run_id"]
         print(f"=== Finding artifact: {artifact_name} ===")
@@ -407,15 +419,18 @@ def download_and_decompress(analyzed_database: dict) -> str:
         with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
             # And we just untar it to the same directory as the zip file
             tar_ref.extractall(artifact_unzipped_location)
-        ret = artifact_unzipped_location / language
-        print(f"Decompression complete: {ret}")
-        return ret
+        database_location = database_dir_for_project(pretty_name)
+        # Move the database to the canonical location
+        shutil.move(artifact_unzipped_location / language, database_location)
+
+        print(f"Decompression complete: {database_location}")
+        return True
 
     results = run_in_parallel(
         download_and_decompress,
         list(analyzed_databases.values()),
         on_error=lambda db, exc: print(
-            f"ERROR: Failed to download and decompress {db["artifact_name"]}: {exc}"
+            f"ERROR: Failed to download and decompress {db['artifact_name']}: {exc}"
         ),
         error_summary=lambda failures: print(
             f"ERROR: Failed to download {len(failures)} databases: {', '.join(item[0] for item in failures)}"
```
```diff
@@ -460,6 +475,13 @@ def main(config, args) -> None:
     # Create build directory if it doesn't exist
     build_dir.mkdir(parents=True, exist_ok=True)
 
+    # Check if reusing databases is requested and all databases exist
+    reuse_databases = args.reuse_databases
+    all_databases_exist = reuse_databases and all(
+        database_for_project_exists(project["name"])
+        for project in projects
+    )
+
     database_results = []
     match get_strategy(config):
         case "repo":
```
```diff
@@ -487,14 +509,15 @@ def main(config, args) -> None:
                 language,
                 experiment_names,
                 pat,
+                args.reuse_databases,
                 projects,
             )
 
     # Generate models for all projects
     print("\n=== Generating models ===")
 
     failed_builds = [
-        project["name"] for project, db_dir in database_results if db_dir is None
+        project["name"] for project, success in database_results if not success
     ]
     if failed_builds:
         print(
@@ -506,8 +529,9 @@ def main(config, args) -> None:
     for project, _ in database_results:
         clean_up_mad_destination_for_project(config, project["name"])
 
-    for project, database_dir in database_results:
-        if database_dir is not None:
+    for project, success in database_results:
+        database_dir = database_dir_for_project(project["name"])
+        if success:
             generate_models(config, args, project, database_dir)
 
 
@@ -543,6 +567,12 @@ def main(config, args) -> None:
         help="What `--threads` value to pass to `codeql` (default %(default)s)",
         default=0,
     )
+    parser.add_argument(
+        "--reuse-databases",
+        type=bool,
+        help="Whether to reuse existing databases instead of rebuilding them",
+        default=False,
+    )
     args = parser.parse_args()
 
     # Load config file
```
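
One caveat on the new flag: `argparse` with `type=bool` does not parse booleans; `bool(s)` is truthy for any non-empty string, so `--reuse-databases False` would still evaluate as `True`. A sketch of the conventional alternative (an editorial suggestion, not part of the commit):

```python
import argparse

parser = argparse.ArgumentParser()
# Conventional boolean flag: present means True, absent means False.
parser.add_argument(
    "--reuse-databases",
    action="store_true",
    help="Whether to reuse existing databases instead of rebuilding them",
)
print(parser.parse_args(["--reuse-databases"]).reuse_databases)  # True
```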

rust/bulk_generation_targets.yml

Lines changed: 13 additions & 13 deletions

```diff
@@ -10,16 +10,16 @@ single-file: true # dump models into a single file per crate (we do not have pr
 # if a target has a dependency in this same list, it should be listed after that dependency
 targets:
   - name: rust
-  - name: libc
-  - name: log
-  - name: memchr
-  - name: once_cell
-  - name: rand
-  - name: smallvec
-  - name: serde
-  - name: tokio
-  - name: reqwest
-  - name: rocket
-  - name: actix-web
-  - name: hyper
-  - name: clap
+  # - name: libc
+  # - name: log
+  # - name: memchr
+  # - name: once_cell
+  # - name: rand
+  # - name: smallvec
+  # - name: serde
+  # - name: tokio
+  # - name: reqwest
+  # - name: rocket
+  # - name: actix-web
+  # - name: hyper
+  # - name: clap
```

rust/ql/lib/codeql/rust/dataflow/internal/DataFlowImpl.qll

Lines changed: 18 additions & 2 deletions

```diff
@@ -286,7 +286,11 @@ predicate indexAssignment(
   not index.getResolvedTarget().fromSource()
 }
 
-module RustDataFlow implements InputSig<Location> {
+signature module RustDataFlowInputSig {
+  predicate includeDynamicTargets();
+}
+
+module RustDataFlowGen<RustDataFlowInputSig Input> implements InputSig<Location> {
   private import Aliases
   private import codeql.rust.dataflow.DataFlow
   private import Node as Node
@@ -441,8 +445,12 @@ module RustDataFlow implements InputSig<Location> {
   /** Gets a viable implementation of the target of the given `Call`. */
   DataFlowCallable viableCallable(DataFlowCall call) {
     exists(Call c | c = call.asCall() |
+      Input::includeDynamicTargets() and
       result.asCfgScope() = c.getARuntimeTarget()
       or
+      not Input::includeDynamicTargets() and
+      result.asCfgScope() = c.getStaticTarget()
+      or
       exists(SummarizedCallable sc, Function staticTarget |
         staticTarget = getStaticTargetExt(c) and
         sc = result.asSummarizedCallable() and
@@ -908,6 +916,12 @@ module RustDataFlow implements InputSig<Location> {
   class DataFlowSecondLevelScope = Void;
 }
 
+module RustDataFlowInput implements RustDataFlowInputSig {
+  predicate includeDynamicTargets() { any() }
+}
+
+module RustDataFlow = RustDataFlowGen<RustDataFlowInput>;
+
 /** Provides logic related to captured variables. */
 module VariableCapture {
   private import codeql.rust.internal.CachedStages
@@ -1079,7 +1093,7 @@ private module Cached {
   }
 
   cached
-  newtype TParameterPosition =
+  newtype TParameterPositioni =
     TPositionalParameterPosition(int i) {
       i in [0 .. max([any(ParamList l).getNumberOfParams(), any(ArgList l).getNumberOfArgs()]) - 1]
       or
@@ -1090,6 +1104,8 @@ private module Cached {
     TClosureSelfParameterPosition() or
     TSelfParameterPosition()
 
+  final class TParameterPosition = TParameterPositioni;
+
   cached
   newtype TReturnKind = TNormalReturnKind()
```

rust/ql/lib/codeql/rust/dataflow/internal/TaintTrackingImpl.qll

Lines changed: 23 additions & 13 deletions

```diff
@@ -1,8 +1,9 @@
 private import rust
+private import codeql.dataflow.DataFlow as DF
 private import codeql.dataflow.TaintTracking
-private import codeql.rust.dataflow.DataFlow
+private import codeql.rust.dataflow.DataFlow as RustDataFlow
 private import codeql.rust.dataflow.FlowSummary
-private import DataFlowImpl
+private import DataFlowImpl as DataFlowImpl
 private import Node as Node
 private import Content
 private import FlowSummaryImpl as FlowSummaryImpl
@@ -29,15 +30,19 @@ private predicate excludedTaintStepContent(Content c) {
   )
 }
 
-module RustTaintTracking implements InputSig<Location, RustDataFlow> {
-  predicate defaultTaintSanitizer(DataFlow::Node node) { none() }
+module RustTaintTrackingGen<DataFlowImpl::RustDataFlowInputSig I> implements
+  InputSig<Location, DataFlowImpl::RustDataFlowGen<I>>
+{
+  private module TheDataFlow = DataFlowImpl::RustDataFlowGen<I>;
+
+  predicate defaultTaintSanitizer(TheDataFlow::Node node) { none() }
 
   /**
    * Holds if the additional step from `pred` to `succ` should be included in all
    * global taint flow configurations.
    */
   cached
-  predicate defaultAdditionalTaintStep(DataFlow::Node pred, DataFlow::Node succ, string model) {
+  predicate defaultAdditionalTaintStep(TheDataFlow::Node pred, TheDataFlow::Node succ, string model) {
     Stages::DataFlowStage::ref() and
     model = "" and
     (
@@ -53,14 +58,14 @@ module RustTaintTracking implements InputSig<Location, RustDataFlow> {
       // is tainted and an operation reads from `foo` (e.g., `foo.bar`) then
      // taint is propagated.
       exists(Content c |
-        RustDataFlow::readContentStep(pred, c, succ) and
+        TheDataFlow::readContentStep(pred, c, succ) and
         not excludedTaintStepContent(c)
       )
       or
       // In addition to the above, for element and reference content we let
       // _all_ read steps (including those from flow summaries and those that
       // result in small primitive types) give rise to taint steps.
-      exists(SingletonContentSet cs | RustDataFlow::readStep(pred, cs, succ) |
+      exists(SingletonContentSet cs | TheDataFlow::readStep(pred, cs, succ) |
         cs.getContent() instanceof ElementContent
         or
         cs.getContent() instanceof ReferenceContent
@@ -79,9 +84,11 @@ module RustTaintTracking implements InputSig<Location, RustDataFlow> {
       )
       or
       succ.(Node::PostUpdateNode).getPreUpdateNode().asExpr() =
-        getPostUpdateReverseStep(pred.(Node::PostUpdateNode).getPreUpdateNode().asExpr(), false)
+        DataFlowImpl::getPostUpdateReverseStep(pred.(Node::PostUpdateNode)
+              .getPreUpdateNode()
+              .asExpr(), false)
       or
-      indexAssignment(any(CompoundAssignmentExpr cae),
+      DataFlowImpl::indexAssignment(any(CompoundAssignmentExpr cae),
         pred.(Node::PostUpdateNode).getPreUpdateNode().asExpr(), _, succ, _)
     )
     or
@@ -94,19 +101,22 @@ module RustTaintTracking implements InputSig<Location, RustDataFlow> {
    * and inputs to additional taint steps.
    */
   bindingset[node]
-  predicate defaultImplicitTaintRead(DataFlow::Node node, ContentSet cs) {
+  predicate defaultImplicitTaintRead(TheDataFlow::Node node, ContentSet cs) {
     exists(node) and
     exists(Content c | c = cs.(SingletonContentSet).getContent() |
       c instanceof ElementContent or
       c instanceof ReferenceContent
-    ) and
+    ) // and
     // Optional steps are added through isAdditionalFlowStep but we don't want the implicit reads
-    not optionalStep(node, _, _)
+    // FIXME:
+    // not optionalStep(node, _, _)
   }
 
   /**
    * Holds if the additional step from `src` to `sink` should be considered in
    * speculative taint flow exploration.
    */
-  predicate speculativeTaintStep(DataFlow::Node src, DataFlow::Node sink) { none() }
+  predicate speculativeTaintStep(TheDataFlow::Node src, TheDataFlow::Node sink) { none() }
 }
+
+module RustTaintTracking = RustTaintTrackingGen<DataFlowImpl::RustDataFlowInput>;
```
