diff --git a/docs/snippets/multimodal.mdx b/docs/snippets/multimodal.mdx index 206cd50..334eec6 100644 --- a/docs/snippets/multimodal.mdx +++ b/docs/snippets/multimodal.mdx @@ -16,3 +16,35 @@ export const PyProcessResults = "# Convert back to PIL Image\nfor _, row in resu export const PySearchData = "# Search for similar images\nquery_vector = np.random.rand(128).astype(np.float32)\nresults = tbl.search(query_vector).limit(1).to_pandas()\n"; +export const TsBlobApiIngest = "const blobData = lancedb.makeArrowTable(\n [\n { id: 1, video: Buffer.from(\"fake_video_bytes_1\") },\n { id: 2, video: Buffer.from(\"fake_video_bytes_2\") },\n ],\n { schema: blobSchema },\n);\nconst blobTable = await db.createTable(\"videos\", blobData, {\n mode: \"overwrite\",\n});\n"; + +export const TsBlobApiSchema = "const blobSchema = new arrow.Schema([\n new arrow.Field(\"id\", new arrow.Int64()),\n new arrow.Field(\n \"video\",\n new arrow.LargeBinary(),\n true,\n new Map([[\"lance-encoding:blob\", \"true\"]]),\n ),\n]);\n"; + +export const TsCreateDummyData = "const createDummyImage = (color: string): Uint8Array => {\n const pngHeader = Uint8Array.from([137, 80, 78, 71, 13, 10, 26, 10]);\n return Buffer.concat([Buffer.from(pngHeader), Buffer.from(color, \"utf8\")]);\n};\n\nconst data = [\n {\n id: 1,\n filename: \"red_square.png\",\n vector: Array.from({ length: 128 }, (_, i) => (i % 16) / 16),\n image_blob: createDummyImage(\"red\"),\n label: \"red\",\n },\n {\n id: 2,\n filename: \"blue_square.png\",\n vector: Array.from({ length: 128 }, (_, i) => ((i + 8) % 16) / 16),\n image_blob: createDummyImage(\"blue\"),\n label: \"blue\",\n },\n];\n"; + +export const TsDefineSchema = "const schema = new arrow.Schema([\n new arrow.Field(\"id\", new arrow.Int32()),\n new arrow.Field(\"filename\", new arrow.Utf8()),\n new arrow.Field(\n \"vector\",\n new arrow.FixedSizeList(\n 128,\n new arrow.Field(\"item\", new arrow.Float32(), true),\n ),\n ),\n new arrow.Field(\"image_blob\", new arrow.Binary()),\n new arrow.Field(\"label\", new arrow.Utf8()),\n]);\n"; + +export const TsIngestData = "const multimodalData = lancedb.makeArrowTable(data, { schema });\nconst tbl = await db.createTable(\"images\", multimodalData, {\n mode: \"overwrite\",\n});\n"; + +export const TsMultimodalImports = "import * as arrow from \"apache-arrow\";\nimport { Buffer } from \"node:buffer\";\nimport * as lancedb from \"@lancedb/lancedb\";\n"; + +export const TsProcessResults = "for (const row of results) {\n const imageBytes = row.image_blob as Uint8Array;\n console.log(\n `Retrieved image: ${row.filename}, Byte length: ${imageBytes.length}`,\n );\n}\n"; + +export const TsSearchData = "const queryVector = Array.from({ length: 128 }, (_, i) => (i % 16) / 16);\nconst results = await tbl.search(queryVector).limit(1).toArray();\n"; + +export const RsBlobApiIngest = "let blob_rows = vec![\n (1_i64, b\"fake_video_bytes_1\".to_vec()),\n (2_i64, b\"fake_video_bytes_2\".to_vec()),\n];\n\nlet blob_schema = Arc::new(blob_schema);\nlet blob_batch = RecordBatch::try_new(\n blob_schema.clone(),\n vec![\n Arc::new(Int64Array::from_iter_values(blob_rows.iter().map(|row| row.0))),\n Arc::new(LargeBinaryArray::from_iter_values(\n blob_rows.iter().map(|row| row.1.as_slice()),\n )),\n ],\n)\n.unwrap();\nlet blob_reader = RecordBatchIterator::new(vec![Ok(blob_batch)].into_iter(), blob_schema);\nlet blob_table = db\n .create_table(\"videos\", blob_reader)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; + +export const RsBlobApiSchema = "let 
blob_metadata = HashMap::from([(\n    \"lance-encoding:blob\".to_string(),\n    \"true\".to_string(),\n)]);\nlet blob_schema = Schema::new(vec![\n    Field::new(\"id\", DataType::Int64, false),\n    Field::new(\"video\", DataType::LargeBinary, true).with_metadata(blob_metadata),\n]);\n"; + +export const RsCreateDummyData = "let create_dummy_image = |color: u8| -> Vec<u8> {\n    let mut png_like = vec![137, 80, 78, 71, 13, 10, 26, 10];\n    png_like.push(color);\n    png_like\n};\n\nlet data = vec![\n    (\n        1_i32,\n        \"red_square.png\",\n        vec![0.1_f32; 128],\n        create_dummy_image(1),\n        \"red\",\n    ),\n    (\n        2_i32,\n        \"blue_square.png\",\n        vec![0.2_f32; 128],\n        create_dummy_image(2),\n        \"blue\",\n    ),\n];\n"; + +export const RsDefineSchema = "let schema = Schema::new(vec![\n    Field::new(\"id\", DataType::Int32, false),\n    Field::new(\"filename\", DataType::Utf8, false),\n    Field::new(\n        \"vector\",\n        DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 128),\n        false,\n    ),\n    Field::new(\"image_blob\", DataType::Binary, false),\n    Field::new(\"label\", DataType::Utf8, false),\n]);\n"; + +export const RsIngestData = "let schema = Arc::new(schema);\nlet image_batch = RecordBatch::try_new(\n    schema.clone(),\n    vec![\n        Arc::new(Int32Array::from_iter_values(data.iter().map(|row| row.0))),\n        Arc::new(StringArray::from_iter_values(data.iter().map(|row| row.1))),\n        Arc::new(\n            FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(\n                data.iter()\n                    .map(|row| Some(row.2.iter().copied().map(Some).collect::<Vec<_>>())),\n                128,\n            ),\n        ),\n        Arc::new(BinaryArray::from_iter_values(\n            data.iter().map(|row| row.3.as_slice()),\n        )),\n        Arc::new(StringArray::from_iter_values(data.iter().map(|row| row.4))),\n    ],\n)\n.unwrap();\nlet image_reader = RecordBatchIterator::new(vec![Ok(image_batch)].into_iter(), schema.clone());\nlet table = db\n    .create_table(\"images\", image_reader)\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n"; + +export const RsMultimodalImports = "use std::collections::HashMap;\nuse std::sync::Arc;\n\nuse arrow_array::types::Float32Type;\nuse arrow_array::{\n    BinaryArray, FixedSizeListArray, Int32Array, Int64Array, LargeBinaryArray, RecordBatch,\n    RecordBatchIterator, StringArray,\n};\nuse arrow_schema::{DataType, Field, Schema};\nuse futures_util::TryStreamExt;\nuse lancedb::connect;\nuse lancedb::database::CreateTableMode;\nuse lancedb::query::{ExecutableQuery, QueryBase};\n"; + +export const RsProcessResults = "for batch in &results {\n    let filenames = batch\n        .column_by_name(\"filename\")\n        .unwrap()\n        .as_any()\n        .downcast_ref::<StringArray>()\n        .unwrap();\n    let images = batch\n        .column_by_name(\"image_blob\")\n        .unwrap()\n        .as_any()\n        .downcast_ref::<BinaryArray>()\n        .unwrap();\n\n    for row in 0..batch.num_rows() {\n        let image_bytes = images.value(row);\n        println!(\n            \"Retrieved image: {}, Byte length: {}\",\n            filenames.value(row),\n            image_bytes.len()\n        );\n    }\n}\n"; + +export const RsSearchData = "let query_vector = vec![0.1_f32; 128];\nlet results = table\n    .query()\n    .nearest_to(query_vector)\n    .unwrap()\n    .limit(1)\n    .execute()\n    .await\n    .unwrap()\n    .try_collect::<Vec<_>>()\n    .await\n    .unwrap();\n"; + diff --git a/docs/snippets/tables.mdx b/docs/snippets/tables.mdx index ba7655c..d720d8d 100644 --- a/docs/snippets/tables.mdx +++ b/docs/snippets/tables.mdx @@ -26,11 +26,11 @@ export const PyAlterVectorColumn = "vector_dim = 768 # Your embedding dimension export const PyBatchDataInsertion = "import pyarrow as pa\n\ndef make_batches():\n    for i in range(5): # Create 5 batches\n        yield pa.RecordBatch.from_arrays(\n            [\n                
pa.array([[3.1, 4.1], [5.9, 26.5]], pa.list_(pa.float32(), 2)),\n pa.array([f\"item{i*2+1}\", f\"item{i*2+2}\"]),\n pa.array([float((i * 2 + 1) * 10), float((i * 2 + 2) * 10)]),\n ],\n [\"vector\", \"item\", \"price\"],\n )\n\nschema = pa.schema(\n [\n pa.field(\"vector\", pa.list_(pa.float32(), 2)),\n pa.field(\"item\", pa.utf8()),\n pa.field(\"price\", pa.float32()),\n ]\n)\n# Create table with batches\ntable_name = \"batch_ingestion_example\"\ntable = db.create_table(table_name, make_batches(), schema=schema, mode=\"overwrite\")\n"; -export const PyConsistencyCheckoutLatest = "db = tmp_db\n# Create table first\ndata = [{\"vector\": [1.1, 1.2], \"lat\": 45.5}]\ntbl = db.create_table(\"test_table\", data, mode=\"overwrite\")\n\n# (Other writes happen to my_table from another process)\n\n# Check for updates\ntbl.checkout_latest()\n"; +export const PyConsistencyCheckoutLatest = "uri = str(tmp_db.uri)\nwriter_db = lancedb.connect(uri)\nreader_db = lancedb.connect(uri)\nwriter_table = writer_db.create_table(\"consistency_checkout_latest_table\", [{\"id\": 1}], mode=\"overwrite\")\nreader_table = reader_db.open_table(\"consistency_checkout_latest_table\")\n\nwriter_table.add([{\"id\": 2}])\nrows_before_refresh = reader_table.count_rows()\nprint(f\"Rows before checkout_latest: {rows_before_refresh}\")\n\nreader_table.checkout_latest()\nrows_after_refresh = reader_table.count_rows()\nprint(f\"Rows after checkout_latest: {rows_after_refresh}\")\n"; -export const PyConsistencyEventual = "from datetime import timedelta\n\nuri = str(tmp_db.uri) if hasattr(tmp_db, \"uri\") else \"memory://\"\ndb = lancedb.connect(uri, read_consistency_interval=timedelta(seconds=5))\n# Create table first\ndata = [{\"vector\": [1.1, 1.2], \"lat\": 45.5}]\ndb.create_table(\"test_table\", data, mode=\"overwrite\")\ntbl = db.open_table(\"test_table\")\n"; +export const PyConsistencyEventual = "from datetime import timedelta\n\nuri = str(tmp_db.uri)\nwriter_db = lancedb.connect(uri)\nreader_db = lancedb.connect(uri, read_consistency_interval=timedelta(seconds=3600))\nwriter_table = writer_db.create_table(\"consistency_eventual_table\", [{\"id\": 1}], mode=\"overwrite\")\nreader_table = reader_db.open_table(\"consistency_eventual_table\")\nwriter_table.add([{\"id\": 2}])\nrows_after_write = reader_table.count_rows()\nprint(f\"Rows visible before eventual refresh interval: {rows_after_write}\")\n"; -export const PyConsistencyStrong = "from datetime import timedelta\n\nuri = str(tmp_db.uri) if hasattr(tmp_db, \"uri\") else \"memory://\"\ndb = lancedb.connect(uri, read_consistency_interval=timedelta(0))\n# Create table first\ndata = [{\"vector\": [1.1, 1.2], \"lat\": 45.5}]\ndb.create_table(\"test_table\", data, mode=\"overwrite\")\ntbl = db.open_table(\"test_table\")\n"; +export const PyConsistencyStrong = "from datetime import timedelta\n\nuri = str(tmp_db.uri)\nwriter_db = lancedb.connect(uri)\nreader_db = lancedb.connect(uri, read_consistency_interval=timedelta(0))\nwriter_table = writer_db.create_table(\"consistency_strong_table\", [{\"id\": 1}], mode=\"overwrite\")\nreader_table = reader_db.open_table(\"consistency_strong_table\")\nwriter_table.add([{\"id\": 2}])\nrows_after_write = reader_table.count_rows()\nprint(f\"Rows visible with strong consistency: {rows_after_write}\")\n"; export const PyCreateEmptyTable = "import pyarrow as pa\n\nschema = pa.schema(\n [\n pa.field(\"vector\", pa.list_(pa.float32(), 2)),\n pa.field(\"item\", pa.string()),\n pa.field(\"price\", pa.float32()),\n ]\n)\ndb = tmp_db\ntbl = 
db.create_table(\"test_empty_table\", schema=schema, mode=\"overwrite\")\n"; @@ -70,7 +70,7 @@ export const PyMergePartialColumns = "import pyarrow as pa\n\ntable = db.create_ export const PyMergeUpdateInsert = "import pyarrow as pa\n\ntable = db.create_table(\n \"users_example\",\n data=pa.table(\n {\n \"id\": [1, 2],\n \"name\": [\"Alice\", \"Bob\"],\n \"login_count\": [10, 20],\n }\n ),\n mode=\"overwrite\",\n)\n\nincoming_users = pa.table(\n {\n \"id\": [2, 3],\n \"name\": [\"Bobby\", \"Charlie\"],\n \"login_count\": [21, 5],\n }\n)\n\n(\n table.merge_insert(\"id\")\n .when_matched_update_all()\n .when_not_matched_insert_all()\n .execute(incoming_users)\n)\n"; -export const PyOpenExistingTable = "db = tmp_db\n# Create a table first\ndata = [{\"vector\": [1.1, 1.2], \"lat\": 45.5, \"long\": -122.7}]\ndb.create_table(\"test_table\", data, mode=\"overwrite\")\n\n# List table names\nprint(db.table_names())\n\n# Open existing table\ntbl = db.open_table(\"test_table\")\n"; +export const PyOpenExistingTable = "db = tmp_db\n# Create a table first\ndata = [{\"vector\": [1.1, 1.2], \"lat\": 45.5, \"long\": -122.7}]\ndb.create_table(\"test_table\", data, mode=\"overwrite\")\n\n# List table names\nprint(db.list_tables().tables)\n\n# Open existing table\ntbl = db.open_table(\"test_table\")\n"; export const PySchemaAddSetup = "table_name = \"schema_evolution_add_example\"\nif data is None:\n data = [\n {\n \"id\": 1,\n \"name\": \"Laptop\",\n \"price\": 1200.00,\n \"vector\": np.random.random(128).tolist(),\n },\n {\n \"id\": 2,\n \"name\": \"Smartphone\",\n \"price\": 800.00,\n \"vector\": np.random.random(128).tolist(),\n },\n {\n \"id\": 3,\n \"name\": \"Headphones\",\n \"price\": 150.00,\n \"vector\": np.random.random(128).tolist(),\n },\n ]\ntable = tmp_db.create_table(table_name, data, mode=\"overwrite\")\n"; @@ -98,23 +98,23 @@ export const PyUpdateOptimizeCleanup = "from datetime import timedelta\n\ntable. 
export const PyUpdateUsingSql = "import pyarrow as pa\n\ntable = db.create_table(\n \"users_example\",\n data=pa.table(\n {\n \"id\": [1, 2],\n \"name\": [\"Alice\", \"Bob\"],\n \"login_count\": [10, 20],\n }\n ),\n mode=\"overwrite\",\n)\ntable.update(where=\"id = 2\", values_sql={\"login_count\": \"login_count + 1\"})\n"; -export const PyVersioningAddData = "# Add more data\nmore_data = [\n {\n \"id\": 4,\n \"author\": \"Richard Daniel Sanchez\",\n \"quote\": \"That's the way the news goes!\",\n },\n {\"id\": 5, \"author\": \"Morty\", \"quote\": \"Aww geez, Rick!\"},\n]\ntable.add(more_data)\n"; +export const PyVersioningAddData = "more_data = [\n {\n \"id\": 4,\n \"author\": \"Richard Daniel Sanchez\",\n \"quote\": \"That's the way the news goes!\",\n },\n {\"id\": 5, \"author\": \"Morty\", \"quote\": \"Aww geez, Rick!\"},\n]\ntable.add(more_data)\n"; -export const PyVersioningBasicSetup = "import lancedb\nimport numpy as np\nimport pandas as pd\nimport pyarrow as pa\n\n# Connect to LanceDB\ndb = tmp_db\n\n# Create a table with initial data\ntable_name = \"quotes_versioning_example\"\ndata = [\n {\"id\": 1, \"author\": \"Richard\", \"quote\": \"Wubba Lubba Dub Dub!\"},\n {\"id\": 2, \"author\": \"Morty\", \"quote\": \"Rick, what's going on?\"},\n {\n \"id\": 3,\n \"author\": \"Richard\",\n \"quote\": \"I turned myself into a pickle, Morty!\",\n },\n]\n\n# Define schema\nschema = pa.schema(\n [\n pa.field(\"id\", pa.int64()),\n pa.field(\"author\", pa.string()),\n pa.field(\"quote\", pa.string()),\n ]\n)\n\ntable = db.create_table(table_name, data, schema=schema, mode=\"overwrite\")\n"; +export const PyVersioningBasicSetup = "import pyarrow as pa\n\ndb = tmp_db\n\ntable_name = \"quotes_versioning_example\"\ndata = [\n {\"id\": 1, \"author\": \"Richard\", \"quote\": \"Wubba Lubba Dub Dub!\"},\n {\"id\": 2, \"author\": \"Morty\", \"quote\": \"Rick, what's going on?\"},\n {\n \"id\": 3,\n \"author\": \"Richard\",\n \"quote\": \"I turned myself into a pickle, Morty!\",\n },\n]\n\n# Define schema\nschema = pa.schema(\n [\n pa.field(\"id\", pa.int64()),\n pa.field(\"author\", pa.string()),\n pa.field(\"quote\", pa.string()),\n ]\n)\n\ntable = db.create_table(table_name, data, schema=schema, mode=\"overwrite\")\n"; -export const PyVersioningCheckInitialVersion = "# View the initial version\nversions = table.list_versions()\nprint(f\"Number of versions after creation: {len(versions)}\")\nprint(f\"Current version: {table.version}\")\n"; +export const PyVersioningCheckInitialVersion = "versions = table.list_versions()\ncurrent_version = table.version\nprint(f\"Number of versions after creation: {len(versions)}\")\nprint(f\"Current version: {current_version}\")\n"; -export const PyVersioningCheckVersionsAfterMod = "# Check versions after modifications\nversions = table.list_versions()\nversion_count_after_mod = len(versions)\nversion_after_mod = table.version\nprint(f\"Number of versions after modifications: {version_count_after_mod}\")\nprint(f\"Current version: {version_after_mod}\")\n"; +export const PyVersioningCheckVersionsAfterMod = "versions = table.list_versions()\nversion_count_after_mod = len(versions)\nversion_after_mod = table.version\nprint(f\"Number of versions after modifications: {version_count_after_mod}\")\nprint(f\"Current version: {version_after_mod}\")\n"; -export const PyVersioningCheckoutLatest = "# Go back to the latest version\ntable.checkout_latest()\n"; +export const PyVersioningCheckoutLatest = "table.checkout_latest()\n"; -export const PyVersioningDeleteData = "# Let's 
delete data from the table\ntable.delete(\"author != 'Richard Daniel Sanchez'\")\nrows_after_deletion = table.count_rows()\nprint(f\"Number of rows after deletion: {rows_after_deletion}\")\n"; +export const PyVersioningDeleteData = "table.delete(\"author = 'Morty'\")\nrows_after_deletion = table.count_rows()\nprint(f\"Number of rows after deletion: {rows_after_deletion}\")\n"; -export const PyVersioningListAllVersions = "# Let's see all versions\nversions = table.list_versions()\nfor v in versions:\n print(f\"Version {v['version']}, created at {v['timestamp']}\")\n"; +export const PyVersioningListAllVersions = "versions = table.list_versions()\nfor v in versions:\n print(f\"Version {v['version']}, created at {v['timestamp']}\")\n"; -export const PyVersioningRollback = "# Let's roll back to before we added the vector column\n# We'll use the version after modifications but before adding embeddings\ntable.restore(version_after_mod)\n\n# Notice we have one more version now, not less!\nversions = table.list_versions()\nversion_count_after_rollback = len(versions)\nprint(f\"Total number of versions after rollback: {version_count_after_rollback}\")\n"; +export const PyVersioningRollback = "table.restore(version_after_mod)\nversions = table.list_versions()\nversion_count_after_rollback = len(versions)\nprint(f\"Total number of versions after rollback: {version_count_after_rollback}\")\n"; -export const PyVersioningUpdateData = "# Update author names to be more specific\ntable.update(where=\"author='Richard'\", values={\"author\": \"Richard Daniel Sanchez\"})\nrows_after_update = table.count_rows()\nprint(f\"Number of rows after update: {rows_after_update}\")\n"; +export const PyVersioningUpdateData = "table.update(where=\"author='Richard'\", values={\"author\": \"Richard Daniel Sanchez\"})\nrows_after_update = table.count_rows(\"author = 'Richard Daniel Sanchez'\")\nprint(f\"Rows updated to Richard Daniel Sanchez: {rows_after_update}\")\n"; export const TsAddColumnsCalculated = "// Add a discounted price column (10% discount)\nawait schemaAddTable.addColumns([\n {\n name: \"discounted_price\",\n valueSql: \"cast((price * 0.9) as float)\",\n },\n]);\n"; @@ -134,6 +134,12 @@ export const TsAlterColumnsWithExpression = "// For custom transforms, create a export const TsAlterVectorColumn = "const oldDim = 384;\nconst newDim = 1024;\nconst vectorSchema = new arrow.Schema([\n new arrow.Field(\"id\", new arrow.Int64()),\n new arrow.Field(\n \"embedding\",\n new arrow.FixedSizeList(\n oldDim,\n new arrow.Field(\"item\", new arrow.Float16(), true),\n ),\n true,\n ),\n]);\nconst vectorData = lancedb.makeArrowTable(\n [{ id: 1, embedding: Array.from({ length: oldDim }, () => Math.random()) }],\n { schema: vectorSchema },\n);\nconst vectorTable = await db.createTable(\"vector_alter_example\", vectorData, {\n mode: \"overwrite\",\n});\n\n// Changing FixedSizeList dimensions (384 -> 1024) is not supported via alterColumns.\n// Use addColumns + dropColumns + alterColumns(rename) to replace the column.\nawait vectorTable.addColumns([\n {\n name: \"embedding_v2\",\n valueSql: `arrow_cast(NULL, 'FixedSizeList(${newDim}, Float16)')`,\n },\n]);\nawait vectorTable.dropColumns([\"embedding\"]);\nawait vectorTable.alterColumns([{ path: \"embedding_v2\", rename: \"embedding\" }]);\n"; +export const TsConsistencyCheckoutLatest = "const checkoutWriterDb = await lancedb.connect(databaseDir);\nconst checkoutReaderDb = await lancedb.connect(databaseDir);\nconst checkoutWriterTable = await checkoutWriterDb.createTable(\n 
\"consistency_checkout_latest_table\",\n [{ id: 1 }],\n { mode: \"overwrite\" },\n);\nconst checkoutReaderTable = await checkoutReaderDb.openTable(\n \"consistency_checkout_latest_table\",\n);\nawait checkoutWriterTable.add([{ id: 2 }]);\nconst rowsBeforeRefresh = await checkoutReaderTable.countRows();\nconsole.log(`Rows before checkoutLatest: ${rowsBeforeRefresh}`);\nawait checkoutReaderTable.checkoutLatest();\nconst rowsAfterRefresh = await checkoutReaderTable.countRows();\nconsole.log(`Rows after checkoutLatest: ${rowsAfterRefresh}`);\n"; + +export const TsConsistencyEventual = "const eventualWriterDb = await lancedb.connect(databaseDir);\nconst eventualReaderDb = await lancedb.connect(databaseDir, {\n readConsistencyInterval: 3600,\n});\nconst eventualWriterTable = await eventualWriterDb.createTable(\n \"consistency_eventual_table\",\n [{ id: 1 }],\n { mode: \"overwrite\" },\n);\nconst eventualReaderTable = await eventualReaderDb.openTable(\n \"consistency_eventual_table\",\n);\nawait eventualWriterTable.add([{ id: 2 }]);\nconst eventualRowsAfterWrite = await eventualReaderTable.countRows();\nconsole.log(\n `Rows visible before eventual refresh interval: ${eventualRowsAfterWrite}`,\n);\n"; + +export const TsConsistencyStrong = "const strongWriterDb = await lancedb.connect(databaseDir);\nconst strongReaderDb = await lancedb.connect(databaseDir, {\n readConsistencyInterval: 0,\n});\nconst strongWriterTable = await strongWriterDb.createTable(\n \"consistency_strong_table\",\n [{ id: 1 }],\n { mode: \"overwrite\" },\n);\nconst strongReaderTable = await strongReaderDb.openTable(\n \"consistency_strong_table\",\n);\nawait strongWriterTable.add([{ id: 2 }]);\nconst strongRowsAfterWrite = await strongReaderTable.countRows();\nconsole.log(`Rows visible with strong consistency: ${strongRowsAfterWrite}`);\n"; + export const TsCreateEmptyTable = "const emptySchema = new arrow.Schema([\n new arrow.Field(\n \"vector\",\n new arrow.FixedSizeList(\n 2,\n new arrow.Field(\"item\", new arrow.Float32(), true),\n ),\n ),\n new arrow.Field(\"item\", new arrow.Utf8()),\n new arrow.Field(\"price\", new arrow.Float32()),\n]);\nconst emptyTable = await db.createEmptyTable(\n \"test_empty_table\",\n emptySchema,\n {\n mode: \"overwrite\",\n },\n);\n"; export const TsCreateTableCustomSchema = "const customSchema = new arrow.Schema([\n new arrow.Field(\n \"vector\",\n new arrow.FixedSizeList(\n 4,\n new arrow.Field(\"item\", new arrow.Float32(), true),\n ),\n ),\n new arrow.Field(\"lat\", new arrow.Float32()),\n new arrow.Field(\"long\", new arrow.Float32()),\n]);\n\nconst customSchemaData = lancedb.makeArrowTable(\n [\n { vector: [1.1, 1.2, 1.3, 1.4], lat: 45.5, long: -122.7 },\n { vector: [0.2, 1.8, 0.4, 3.6], lat: 40.1, long: -74.1 },\n ],\n { schema: customSchema },\n);\nconst customSchemaTable = await db.createTable(\n \"my_table_custom_schema\",\n customSchemaData,\n { mode: \"overwrite\" },\n);\n"; @@ -144,12 +150,24 @@ export const TsCreateTableFromDicts = "type Location = {\n vector: number[];\n export const TsCreateTableFromIterator = "const batchSchema = new arrow.Schema([\n new arrow.Field(\n \"vector\",\n new arrow.FixedSizeList(\n 4,\n new arrow.Field(\"item\", new arrow.Float32(), true),\n ),\n ),\n new arrow.Field(\"item\", new arrow.Utf8()),\n new arrow.Field(\"price\", new arrow.Float32()),\n]);\n\nconst tableForBatches = await db.createEmptyTable(\n \"batched_table\",\n batchSchema,\n {\n mode: \"overwrite\",\n },\n);\n\nconst rows = Array.from({ length: 10 }, (_, i) => ({\n vector: [i + 0.1, i 
+ 0.2, i + 0.3, i + 0.4],\n item: `item-${i + 1}`,\n price: (i + 1) * 10,\n}));\n\nconst chunkSize = 2;\nfor (let i = 0; i < rows.length; i += chunkSize) {\n const batch = lancedb.makeArrowTable(rows.slice(i, i + chunkSize), {\n schema: batchSchema,\n });\n await tableForBatches.add(batch);\n}\n"; +export const TsDeleteOperation = "// delete data\nconst predicate = \"id = 3\";\nawait table.delete(predicate);\n"; + export const TsDropColumnsMultiple = "// Remove the second temporary column\nawait schemaDropTable.dropColumns([\"temp_col2\"]);\n"; export const TsDropColumnsSingle = "// Remove the first temporary column\nawait schemaDropTable.dropColumns([\"temp_col1\"]);\n"; export const TsDropTable = "await db.createTable(\"my_table\", [{ vector: [1.1, 1.2], lat: 45.5 }], {\n mode: \"overwrite\",\n});\n\nawait db.dropTable(\"my_table\");\n"; +export const TsInsertIfNotExists = "const table = await db.createTable(\n \"users_example\",\n [\n { id: 1, name: \"Alice\", login_count: 10 },\n { id: 2, name: \"Bob\", login_count: 20 },\n ],\n { mode: \"overwrite\" },\n);\n\nconst incomingUsers = [\n { id: 2, name: \"Bobby\", login_count: 21 },\n { id: 3, name: \"Charlie\", login_count: 5 },\n];\n\nawait table\n .mergeInsert(\"id\")\n .whenNotMatchedInsertAll()\n .execute(incomingUsers);\n"; + +export const TsMergeDeleteMissingBySource = "const table = await db.createTable(\n \"users_example\",\n [\n { id: 1, name: \"Alice\", login_count: 10 },\n { id: 2, name: \"Bob\", login_count: 20 },\n { id: 3, name: \"Charlie\", login_count: 5 },\n ],\n { mode: \"overwrite\" },\n);\n\nconst incomingUsers = [\n { id: 2, name: \"Bobby\", login_count: 21 },\n { id: 3, name: \"Charlie\", login_count: 5 },\n];\n\nawait table\n .mergeInsert(\"id\")\n .whenMatchedUpdateAll()\n .whenNotMatchedInsertAll()\n .whenNotMatchedBySourceDelete()\n .execute(incomingUsers);\n"; + +export const TsMergeMatchedUpdateOnly = "const table = await db.createTable(\n \"users_example\",\n [\n { id: 1, name: \"Alice\", login_count: 10 },\n { id: 2, name: \"Bob\", login_count: 20 },\n ],\n { mode: \"overwrite\" },\n);\n\nconst incomingUsers = [\n { id: 2, name: \"Bobby\", login_count: 21 },\n { id: 3, name: \"Charlie\", login_count: 5 },\n];\n\nawait table\n .mergeInsert(\"id\")\n .whenMatchedUpdateAll()\n .execute(incomingUsers);\n"; + +export const TsMergePartialColumns = "const table = await db.createTable(\n \"users_example\",\n [\n { id: 1, name: \"Alice\", login_count: 10 },\n { id: 2, name: \"Bob\", login_count: 20 },\n ],\n { mode: \"overwrite\" },\n);\n\nconst incomingUsers = [\n { id: 2, name: \"Bobby\" },\n { id: 3, name: \"Charlie\" },\n];\n\nawait table\n .mergeInsert(\"id\")\n .whenMatchedUpdateAll()\n .whenNotMatchedInsertAll()\n .execute(incomingUsers);\n"; + +export const TsMergeUpdateInsert = "const table = await db.createTable(\n \"users_example\",\n [\n { id: 1, name: \"Alice\", login_count: 10 },\n { id: 2, name: \"Bob\", login_count: 20 },\n ],\n { mode: \"overwrite\" },\n);\n\nconst incomingUsers = [\n { id: 2, name: \"Bobby\", login_count: 21 },\n { id: 3, name: \"Charlie\", login_count: 5 },\n];\n\nawait table\n .mergeInsert(\"id\")\n .whenMatchedUpdateAll()\n .whenNotMatchedInsertAll()\n .execute(incomingUsers);\n"; + export const TsOpenExistingTable = "const openTableData = [{ vector: [1.1, 1.2], lat: 45.5, long: -122.7 }];\nawait db.createTable(\"test_table_open\", openTableData, {\n mode: \"overwrite\",\n});\n\nconsole.log(await db.tableNames());\n\nconst openedTable = await 
db.openTable(\"test_table_open\");\n"; export const TsSchemaAddSetup = "const schemaAddData = [\n {\n id: 1,\n name: \"Laptop\",\n price: 1200.0,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n {\n id: 2,\n name: \"Smartphone\",\n price: 800.0,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n {\n id: 3,\n name: \"Headphones\",\n price: 150.0,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n];\nconst schemaAddTable = await db.createTable(\n \"schema_evolution_add_example\",\n schemaAddData,\n { mode: \"overwrite\" },\n);\n"; @@ -158,6 +176,36 @@ export const TsSchemaAlterSetup = "const schemaAlter = new arrow.Schema([\n new export const TsSchemaDropSetup = "const schemaDropData = [\n {\n id: 1,\n name: \"Laptop\",\n price: 1200.0,\n temp_col1: \"X\",\n temp_col2: 100,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n {\n id: 2,\n name: \"Smartphone\",\n price: 800.0,\n temp_col1: \"Y\",\n temp_col2: 200,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n {\n id: 3,\n name: \"Headphones\",\n price: 150.0,\n temp_col1: \"Z\",\n temp_col2: 300,\n vector: Array.from({ length: 128 }, () => Math.random()),\n },\n];\nconst schemaDropTable = await db.createTable(\n \"schema_evolution_drop_example\",\n schemaDropData,\n { mode: \"overwrite\" },\n);\n"; +export const TsUpdateConnectCloud = "const db = await lancedb.connect(\"db://your-project-slug\", {\n apiKey: \"your-api-key\",\n region: \"us-east-1\",\n});\n"; + +export const TsUpdateConnectLocal = "const db = await lancedb.connect(\"./data\");\n"; + +export const TsUpdateExampleTableSetup = "const table = await db.createTable(\n \"users_example\",\n [\n { id: 1, name: \"Alice\", login_count: 10 },\n { id: 2, name: \"Bob\", login_count: 20 },\n ],\n { mode: \"overwrite\" },\n);\n"; + +export const TsUpdateOperation = "const table = await db.createTable(\n \"users_example\",\n [\n { id: 1, name: \"Alice\", login_count: 10 },\n { id: 2, name: \"Bob\", login_count: 20 },\n ],\n { mode: \"overwrite\" },\n);\nawait table.update({ where: \"id = 2\", values: { name: \"Bobby\" } });\n"; + +export const TsUpdateOptimizeCleanup = "const olderThan = new Date();\nolderThan.setDate(olderThan.getDate() - 1);\nawait table.optimize({ cleanupOlderThan: olderThan });\n"; + +export const TsUpdateUsingSql = "const table = await db.createTable(\n \"users_example\",\n [\n { id: 1, name: \"Alice\", login_count: 10 },\n { id: 2, name: \"Bob\", login_count: 20 },\n ],\n { mode: \"overwrite\" },\n);\nawait table.update({\n where: \"id = 2\",\n valuesSql: { login_count: \"login_count + 1\" },\n});\n"; + +export const TsVersioningAddData = "const moreData = [\n {\n id: 4,\n author: \"Richard Daniel Sanchez\",\n quote: \"That's the way the news goes!\",\n },\n { id: 5, author: \"Morty\", quote: \"Aww geez, Rick!\" },\n];\nawait table.add(moreData);\n"; + +export const TsVersioningBasicSetup = "const tableName = \"quotes_versioning_example\";\nconst data = [\n { id: 1, author: \"Richard\", quote: \"Wubba Lubba Dub Dub!\" },\n { id: 2, author: \"Morty\", quote: \"Rick, what's going on?\" },\n {\n id: 3,\n author: \"Richard\",\n quote: \"I turned myself into a pickle, Morty!\",\n },\n];\nconst table = await db.createTable(tableName, data, { mode: \"overwrite\" });\n"; + +export const TsVersioningCheckInitialVersion = "const versions = await table.listVersions();\nconst currentVersion = await table.version();\nconsole.log(`Number of versions after creation: 
${versions.length}`);\nconsole.log(`Current version: ${currentVersion}`);\n"; + +export const TsVersioningCheckVersionsAfterMod = "const versionsAfterMod = await table.listVersions();\nconst versionCountAfterMod = versionsAfterMod.length;\nconst versionAfterMod = await table.version();\nconsole.log(\n  `Number of versions after modifications: ${versionCountAfterMod}`,\n);\nconsole.log(`Current version: ${versionAfterMod}`);\n"; + +export const TsVersioningCheckoutLatest = "await table.checkoutLatest();\n"; + +export const TsVersioningDeleteData = "await table.delete(\"author = 'Morty'\");\nconst rowsAfterDeletion = await table.countRows();\nconsole.log(`Number of rows after deletion: ${rowsAfterDeletion}`);\n"; + +export const TsVersioningListAllVersions = "const allVersions = await table.listVersions();\nfor (const v of allVersions) {\n  console.log(`Version ${v.version}, created at ${v.timestamp}`);\n}\n"; + +export const TsVersioningRollback = "await table.checkout(versionAfterMod);\nawait table.restore();\nconst versionsAfterRollback = await table.listVersions();\nconst versionCountAfterRollback = versionsAfterRollback.length;\nconsole.log(\n  `Total number of versions after rollback: ${versionCountAfterRollback}`,\n);\n"; + +export const TsVersioningUpdateData = "await table.update({\n  where: \"author = 'Richard'\",\n  values: { author: \"Richard Daniel Sanchez\" },\n});\nconst rowsAfterUpdate = await table.countRows(\n  \"author = 'Richard Daniel Sanchez'\",\n);\nconsole.log(`Rows updated to Richard Daniel Sanchez: ${rowsAfterUpdate}`);\n"; + export const RsAddColumnsCalculated = "// Add a discounted price column (10% discount)\nschema_add_table\n    .add_columns(\n        NewColumnTransform::SqlExpressions(vec![(\n            \"discounted_price\".to_string(),\n            \"cast((price * 0.9) as float)\".to_string(),\n        )]),\n        None,\n    )\n    .await\n    .unwrap();\n"; export const RsAddColumnsDefaultValues = "// Add a stock status column with default value\nschema_add_table\n    .add_columns(\n        NewColumnTransform::SqlExpressions(vec![(\n            \"in_stock\".to_string(),\n            \"cast(true as boolean)\".to_string(),\n        )]),\n        None,\n    )\n    .await\n    .unwrap();\n"; @@ -176,6 +224,12 @@ export const RsAlterColumnsWithExpression = "// For custom transforms, create a export const RsAlterVectorColumn = "let old_dim = 384;\nlet new_dim = 1024;\nlet vector_schema = Arc::new(Schema::new(vec![\n    Field::new(\"id\", DataType::Int64, false),\n    Field::new(\n        \"embedding\",\n        DataType::FixedSizeList(\n            Arc::new(Field::new(\"item\", DataType::Float32, true)),\n            old_dim,\n        ),\n        true,\n    ),\n]));\nlet vector_batch = RecordBatch::try_new(\n    vector_schema.clone(),\n    vec![\n        Arc::new(Int64Array::from(vec![1])),\n        Arc::new(\n            FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(\n                vec![Some(vec![Some(0.1_f32); old_dim as usize])],\n                old_dim,\n            ),\n        ),\n    ],\n)\n.unwrap();\nlet vector_reader =\n    RecordBatchIterator::new(vec![Ok(vector_batch)].into_iter(), vector_schema.clone());\nlet vector_table = db\n    .create_table(\"vector_alter_example\", vector_reader)\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n\n// Changing FixedSizeList dimensions (384 -> 1024) is not supported via alter_columns.\n// Use add_columns + drop_columns + alter_columns(rename) to replace the column.\nvector_table\n    .add_columns(\n        NewColumnTransform::SqlExpressions(vec![(\n            \"embedding_v2\".to_string(),\n            format!(\"arrow_cast(NULL, 'FixedSizeList({}, Float32)')\", new_dim),\n        )]),\n        None,\n    )\n    .await\n    
.unwrap();\nvector_table.drop_columns(&[\"embedding\"]).await.unwrap();\nvector_table\n .alter_columns(&[ColumnAlteration::new(\"embedding_v2\".to_string())\n .rename(\"embedding\".to_string())])\n .await\n .unwrap();\n"; +export const RsConsistencyCheckoutLatest = "let checkout_writer_db = connect(&db_uri).execute().await.unwrap();\nlet checkout_reader_db = connect(&db_uri).execute().await.unwrap();\nlet checkout_writer_table = checkout_writer_db\n .create_table(\n \"consistency_checkout_latest_table\",\n make_users_reader(vec![1], vec![\"Alice\"], None),\n )\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\nlet checkout_reader_table = checkout_reader_db\n .open_table(\"consistency_checkout_latest_table\")\n .execute()\n .await\n .unwrap();\ncheckout_writer_table\n .add(make_users_reader(vec![2], vec![\"Bob\"], None))\n .execute()\n .await\n .unwrap();\nlet rows_before_refresh = checkout_reader_table.count_rows(None).await.unwrap();\nprintln!(\"Rows before checkout_latest: {}\", rows_before_refresh);\ncheckout_reader_table.checkout_latest().await.unwrap();\nlet rows_after_refresh = checkout_reader_table.count_rows(None).await.unwrap();\nprintln!(\"Rows after checkout_latest: {}\", rows_after_refresh);\n"; + +export const RsConsistencyEventual = "let eventual_writer_db = connect(&db_uri).execute().await.unwrap();\nlet eventual_reader_db = connect(&db_uri)\n .read_consistency_interval(StdDuration::from_secs(3600))\n .execute()\n .await\n .unwrap();\nlet eventual_writer_table = eventual_writer_db\n .create_table(\n \"consistency_eventual_table\",\n make_users_reader(vec![1], vec![\"Alice\"], None),\n )\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\nlet eventual_reader_table = eventual_reader_db\n .open_table(\"consistency_eventual_table\")\n .execute()\n .await\n .unwrap();\neventual_writer_table\n .add(make_users_reader(vec![2], vec![\"Bob\"], None))\n .execute()\n .await\n .unwrap();\nlet eventual_rows_after_write = eventual_reader_table.count_rows(None).await.unwrap();\nprintln!(\n \"Rows visible before eventual refresh interval: {}\",\n eventual_rows_after_write\n);\n"; + +export const RsConsistencyStrong = "let strong_writer_db = connect(&db_uri).execute().await.unwrap();\nlet strong_reader_db = connect(&db_uri)\n .read_consistency_interval(StdDuration::from_secs(0))\n .execute()\n .await\n .unwrap();\nlet strong_writer_table = strong_writer_db\n .create_table(\n \"consistency_strong_table\",\n make_users_reader(vec![1], vec![\"Alice\"], None),\n )\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\nlet strong_reader_table = strong_reader_db\n .open_table(\"consistency_strong_table\")\n .execute()\n .await\n .unwrap();\nstrong_writer_table\n .add(make_users_reader(vec![2], vec![\"Bob\"], None))\n .execute()\n .await\n .unwrap();\nlet strong_rows_after_write = strong_reader_table.count_rows(None).await.unwrap();\nprintln!(\n \"Rows visible with strong consistency: {}\",\n strong_rows_after_write\n);\n"; + export const RsCreateEmptyTable = "let empty_schema = Arc::new(Schema::new(vec![\n Field::new(\n \"vector\",\n DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 2),\n false,\n ),\n Field::new(\"item\", DataType::Utf8, false),\n Field::new(\"price\", DataType::Float32, false),\n]));\nlet empty_table = db\n .create_empty_table(\"test_empty_table\", empty_schema)\n .mode(CreateTableMode::Overwrite)\n .execute()\n .await\n .unwrap();\n"; export const RsCreateTableCustomSchema = "let 
custom_schema = Arc::new(Schema::new(vec![\n    Field::new(\n        \"vector\",\n        DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 4),\n        false,\n    ),\n    Field::new(\"lat\", DataType::Float32, false),\n    Field::new(\"long\", DataType::Float32, false),\n]));\n\nlet custom_batch = RecordBatch::try_new(\n    custom_schema.clone(),\n    vec![\n        Arc::new(\n            FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(\n                vec![\n                    Some(vec![Some(1.1), Some(1.2), Some(1.3), Some(1.4)]),\n                    Some(vec![Some(0.2), Some(1.8), Some(0.4), Some(3.6)]),\n                ],\n                4,\n            ),\n        ),\n        Arc::new(Float32Array::from(vec![45.5, 40.1])),\n        Arc::new(Float32Array::from(vec![-122.7, -74.1])),\n    ],\n)\n.unwrap();\nlet custom_reader =\n    RecordBatchIterator::new(vec![Ok(custom_batch)].into_iter(), custom_schema.clone());\nlet custom_table = db\n    .create_table(\"my_table_custom_schema\", custom_reader)\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n"; @@ -186,12 +240,24 @@ export const RsCreateTableFromDicts = "struct Location {\n    vector: [f32; 2],\ export const RsCreateTableFromIterator = "let batch_schema = Arc::new(Schema::new(vec![\n    Field::new(\n        \"vector\",\n        DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 4),\n        false,\n    ),\n    Field::new(\"item\", DataType::Utf8, false),\n    Field::new(\"price\", DataType::Float32, false),\n]));\n\nlet batches = (0..5)\n    .map(|i| {\n        RecordBatch::try_new(\n            batch_schema.clone(),\n            vec![\n                Arc::new(\n                    FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(\n                        vec![\n                            Some(vec![Some(3.1 + i as f32), Some(4.1), Some(5.1), Some(6.1)]),\n                            Some(vec![\n                                Some(5.9),\n                                Some(26.5 + i as f32),\n                                Some(4.7),\n                                Some(32.8),\n                            ]),\n                        ],\n                        4,\n                    ),\n                ),\n                Arc::new(StringArray::from(vec![\n                    format!(\"item{}\", i * 2 + 1),\n                    format!(\"item{}\", i * 2 + 2),\n                ])),\n                Arc::new(Float32Array::from(vec![\n                    ((i * 2 + 1) * 10) as f32,\n                    ((i * 2 + 2) * 10) as f32,\n                ])),\n            ],\n        )\n        .unwrap()\n    })\n    .collect::<Vec<_>>();\n\nlet batch_reader = RecordBatchIterator::new(batches.into_iter().map(Ok), batch_schema.clone());\nlet batch_table = db\n    .create_table(\"batched_table\", batch_reader)\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n"; + +export const RsDeleteOperation = "// delete data\nlet predicate = \"id = 3\";\ntable.delete(predicate).await.unwrap();\n"; + export const RsDropColumnsMultiple = "// Remove the second temporary column\nschema_drop_table.drop_columns(&[\"temp_col2\"]).await.unwrap();\n"; export const RsDropColumnsSingle = "// Remove the first temporary column\nschema_drop_table.drop_columns(&[\"temp_col1\"]).await.unwrap();\n"; export const RsDropTable = "let drop_schema = Arc::new(Schema::new(vec![\n    Field::new(\n        \"vector\",\n        DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 2),\n        false,\n    ),\n    Field::new(\"lat\", DataType::Float32, false),\n]));\nlet drop_batch = RecordBatch::try_new(\n    drop_schema.clone(),\n    vec![\n        Arc::new(\n            FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(\n                vec![Some(vec![Some(1.1), Some(1.2)])],\n                2,\n            ),\n        ),\n        Arc::new(Float32Array::from(vec![45.5])),\n    ],\n)\n.unwrap();\nlet drop_reader =\n    RecordBatchIterator::new(vec![Ok(drop_batch)].into_iter(), drop_schema.clone());\ndb.create_table(\"my_table\", drop_reader)\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n\ndb.drop_table(\"my_table\", &[]).await.unwrap();\n"; + +export const RsInsertIfNotExists = "let table = db\n    .create_table(\n        \"users_example\",\n        make_users_reader(vec![1, 2], vec![\"Alice\", \"Bob\"], Some(vec![10, 
20])),\n    )\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n\nlet mut merge_insert = table.merge_insert(&[\"id\"]);\nmerge_insert.when_not_matched_insert_all();\nmerge_insert\n    .execute(make_users_reader(\n        vec![2, 3],\n        vec![\"Bobby\", \"Charlie\"],\n        Some(vec![21, 5]),\n    ))\n    .await\n    .unwrap();\n"; + +export const RsMergeDeleteMissingBySource = "let table = db\n    .create_table(\n        \"users_example\",\n        make_users_reader(\n            vec![1, 2, 3],\n            vec![\"Alice\", \"Bob\", \"Charlie\"],\n            Some(vec![10, 20, 5]),\n        ),\n    )\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n\nlet mut merge_insert = table.merge_insert(&[\"id\"]);\nmerge_insert\n    .when_matched_update_all(None)\n    .when_not_matched_insert_all()\n    .when_not_matched_by_source_delete(None);\nmerge_insert\n    .execute(make_users_reader(\n        vec![2, 3],\n        vec![\"Bobby\", \"Charlie\"],\n        Some(vec![21, 5]),\n    ))\n    .await\n    .unwrap();\n"; + +export const RsMergeMatchedUpdateOnly = "let table = db\n    .create_table(\n        \"users_example\",\n        make_users_reader(vec![1, 2], vec![\"Alice\", \"Bob\"], Some(vec![10, 20])),\n    )\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n\nlet mut merge_insert = table.merge_insert(&[\"id\"]);\nmerge_insert.when_matched_update_all(None);\nmerge_insert\n    .execute(make_users_reader(\n        vec![2, 3],\n        vec![\"Bobby\", \"Charlie\"],\n        Some(vec![21, 5]),\n    ))\n    .await\n    .unwrap();\n"; + +export const RsMergePartialColumns = "let table = db\n    .create_table(\n        \"users_example\",\n        make_users_reader(vec![1, 2], vec![\"Alice\", \"Bob\"], Some(vec![10, 20])),\n    )\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n\nlet mut merge_insert = table.merge_insert(&[\"id\"]);\nmerge_insert\n    .when_matched_update_all(None)\n    .when_not_matched_insert_all();\nmerge_insert\n    .execute(make_users_reader(vec![2, 3], vec![\"Bobby\", \"Charlie\"], None))\n    .await\n    .unwrap();\n"; + +export const RsMergeUpdateInsert = "let table = db\n    .create_table(\n        \"users_example\",\n        make_users_reader(vec![1, 2], vec![\"Alice\", \"Bob\"], Some(vec![10, 20])),\n    )\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n\nlet mut merge_insert = table.merge_insert(&[\"id\"]);\nmerge_insert\n    .when_matched_update_all(None)\n    .when_not_matched_insert_all();\nmerge_insert\n    .execute(make_users_reader(\n        vec![2, 3],\n        vec![\"Bobby\", \"Charlie\"],\n        Some(vec![21, 5]),\n    ))\n    .await\n    .unwrap();\n"; + export const RsOpenExistingTable = "let open_schema = Arc::new(Schema::new(vec![\n    Field::new(\n        \"vector\",\n        DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 2),\n        false,\n    ),\n    Field::new(\"lat\", DataType::Float32, false),\n    Field::new(\"long\", DataType::Float32, false),\n]));\nlet open_batch = RecordBatch::try_new(\n    open_schema.clone(),\n    vec![\n        Arc::new(\n            FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(\n                vec![Some(vec![Some(1.1), Some(1.2)])],\n                2,\n            ),\n        ),\n        Arc::new(Float32Array::from(vec![45.5])),\n        Arc::new(Float32Array::from(vec![-122.7])),\n    ],\n)\n.unwrap();\nlet open_reader =\n    RecordBatchIterator::new(vec![Ok(open_batch)].into_iter(), open_schema.clone());\ndb.create_table(\"test_table\", open_reader)\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n\nprintln!(\"{:?}\", db.table_names().execute().await.unwrap());\n\nlet opened_table = db.open_table(\"test_table\").execute().await.unwrap();\n"; export const RsSchemaAddSetup = "let schema_add_schema = Arc::new(Schema::new(vec![\n    Field::new(\"id\", 
DataType::Int64, false),\n    Field::new(\"name\", DataType::Utf8, false),\n    Field::new(\"price\", DataType::Float64, false),\n    Field::new(\n        \"vector\",\n        DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 128),\n        false,\n    ),\n]));\nlet schema_add_batch = RecordBatch::try_new(\n    schema_add_schema.clone(),\n    vec![\n        Arc::new(Int64Array::from(vec![1, 2, 3])),\n        Arc::new(StringArray::from(vec![\"Laptop\", \"Smartphone\", \"Headphones\"])),\n        Arc::new(Float64Array::from(vec![1200.0, 800.0, 150.0])),\n        Arc::new(\n            FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(\n                vec![\n                    Some(vec![Some(0.1_f32); 128]),\n                    Some(vec![Some(0.2_f32); 128]),\n                    Some(vec![Some(0.3_f32); 128]),\n                ],\n                128,\n            ),\n        ),\n    ],\n)\n.unwrap();\nlet schema_add_reader = RecordBatchIterator::new(\n    vec![Ok(schema_add_batch)].into_iter(),\n    schema_add_schema.clone(),\n);\nlet schema_add_table = db\n    .create_table(\"schema_evolution_add_example\", schema_add_reader)\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n"; @@ -200,3 +266,37 @@ export const RsSchemaAlterSetup = "let schema_alter_schema = Arc::new(Schema::ne export const RsSchemaDropSetup = "let schema_drop_schema = Arc::new(Schema::new(vec![\n    Field::new(\"id\", DataType::Int64, false),\n    Field::new(\"name\", DataType::Utf8, false),\n    Field::new(\"price\", DataType::Float64, false),\n    Field::new(\"temp_col1\", DataType::Utf8, false),\n    Field::new(\"temp_col2\", DataType::Int32, false),\n    Field::new(\n        \"vector\",\n        DataType::FixedSizeList(Arc::new(Field::new(\"item\", DataType::Float32, true)), 128),\n        false,\n    ),\n]));\nlet schema_drop_batch = RecordBatch::try_new(\n    schema_drop_schema.clone(),\n    vec![\n        Arc::new(Int64Array::from(vec![1, 2, 3])),\n        Arc::new(StringArray::from(vec![\"Laptop\", \"Smartphone\", \"Headphones\"])),\n        Arc::new(Float64Array::from(vec![1200.0, 800.0, 150.0])),\n        Arc::new(StringArray::from(vec![\"X\", \"Y\", \"Z\"])),\n        Arc::new(Int32Array::from(vec![100, 200, 300])),\n        Arc::new(\n            FixedSizeListArray::from_iter_primitive::<Float32Type, _, _>(\n                vec![\n                    Some(vec![Some(0.1_f32); 128]),\n                    Some(vec![Some(0.2_f32); 128]),\n                    Some(vec![Some(0.3_f32); 128]),\n                ],\n                128,\n            ),\n        ),\n    ],\n)\n.unwrap();\nlet schema_drop_reader = RecordBatchIterator::new(\n    vec![Ok(schema_drop_batch)].into_iter(),\n    schema_drop_schema.clone(),\n);\nlet schema_drop_table = db\n    .create_table(\"schema_evolution_drop_example\", schema_drop_reader)\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n"; + +export const RsUpdateConnectCloud = "let uri = \"db://your-project-slug\";\nlet api_key = \"your-api-key\";\nlet region = \"us-east-1\";\n"; + +export const RsUpdateConnectLocal = "let db = connect(\"./data\").execute().await.unwrap();\n"; + +export const RsUpdateExampleTableSetup = "let table = db\n    .create_table(\n        \"users_example\",\n        make_users_reader(vec![1, 2], vec![\"Alice\", \"Bob\"], Some(vec![10, 20])),\n    )\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n"; + +export const RsUpdateMakeUsersReader = "fn make_users_reader(\n    ids: Vec<i64>,\n    names: Vec<&str>,\n    login_counts: Option<Vec<i64>>,\n) -> Box<dyn RecordBatchReader + Send> {\n    let mut fields = vec![\n        Field::new(\"id\", DataType::Int64, false),\n        Field::new(\"name\", DataType::Utf8, false),\n    ];\n    let mut columns: Vec<Arc<dyn Array>> =\n        vec![Arc::new(Int64Array::from(ids)), Arc::new(StringArray::from(names))];\n\n    if let Some(login_counts) = login_counts {\n        fields.push(Field::new(\"login_count\", DataType::Int64, true));\n        columns.push(Arc::new(Int64Array::from(login_counts)));\n    }\n\n    let schema = 
Arc::new(Schema::new(fields));\n    let batch = RecordBatch::try_new(schema.clone(), columns).unwrap();\n    let reader = RecordBatchIterator::new(vec![Ok(batch)].into_iter(), schema);\n    Box::new(reader)\n}\n"; + +export const RsUpdateOperation = "let table = db\n    .create_table(\n        \"users_example\",\n        make_users_reader(vec![1, 2], vec![\"Alice\", \"Bob\"], Some(vec![10, 20])),\n    )\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\ntable\n    .update()\n    .only_if(\"id = 2\")\n    .column(\"name\", \"'Bobby'\")\n    .execute()\n    .await\n    .unwrap();\n"; + +export const RsUpdateOptimizeCleanup = "table\n    .optimize(OptimizeAction::Prune {\n        older_than: Some(Duration::days(1)),\n        delete_unverified: None,\n        error_if_tagged_old_versions: None,\n    })\n    .await\n    .unwrap();\n"; + +export const RsUpdateUsingSql = "let table = db\n    .create_table(\n        \"users_example\",\n        make_users_reader(vec![1, 2], vec![\"Alice\", \"Bob\"], Some(vec![10, 20])),\n    )\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\ntable\n    .update()\n    .only_if(\"id = 2\")\n    .column(\"login_count\", \"login_count + 1\")\n    .execute()\n    .await\n    .unwrap();\n"; + +export const RsVersioningAddData = "let more_data = vec![\n    (4, \"Richard Daniel Sanchez\", \"That's the way the news goes!\"),\n    (5, \"Morty\", \"Aww geez, Rick!\"),\n];\ntable\n    .add(make_quotes_reader(more_data))\n    .execute()\n    .await\n    .unwrap();\n"; + +export const RsVersioningBasicSetup = "let table_name = \"quotes_versioning_example\";\nlet data = vec![\n    (1, \"Richard\", \"Wubba Lubba Dub Dub!\"),\n    (2, \"Morty\", \"Rick, what's going on?\"),\n    (3, \"Richard\", \"I turned myself into a pickle, Morty!\"),\n];\n\nlet table = db\n    .create_table(table_name, make_quotes_reader(data))\n    .mode(CreateTableMode::Overwrite)\n    .execute()\n    .await\n    .unwrap();\n"; + +export const RsVersioningCheckInitialVersion = "let versions = table.list_versions().await.unwrap();\nlet current_version = table.version().await.unwrap();\nprintln!(\"Number of versions after creation: {}\", versions.len());\nprintln!(\"Current version: {}\", current_version);\n"; + +export const RsVersioningCheckVersionsAfterMod = "let versions_after_mod = table.list_versions().await.unwrap();\nlet version_count_after_mod = versions_after_mod.len();\nlet version_after_mod = table.version().await.unwrap();\nprintln!(\n    \"Number of versions after modifications: {}\",\n    version_count_after_mod\n);\nprintln!(\"Current version: {}\", version_after_mod);\n"; + +export const RsVersioningCheckoutLatest = "table.checkout_latest().await.unwrap();\n"; + +export const RsVersioningDeleteData = "table.delete(\"author = 'Morty'\").await.unwrap();\nlet rows_after_deletion = table.count_rows(None).await.unwrap();\nprintln!(\"Number of rows after deletion: {}\", rows_after_deletion);\n"; + +export const RsVersioningListAllVersions = "let all_versions = table.list_versions().await.unwrap();\nfor v in &all_versions {\n    println!(\"Version {}, created at {}\", v.version, v.timestamp);\n}\n"; + +export const RsVersioningMakeQuotesReader = "fn make_quotes_reader(rows: Vec<(i64, &str, &str)>) -> Box<dyn RecordBatchReader + Send> {\n    let ids: Vec<i64> = rows.iter().map(|(id, _, _)| *id).collect();\n    let authors: Vec<&str> = rows.iter().map(|(_, author, _)| *author).collect();\n    let quotes: Vec<&str> = rows.iter().map(|(_, _, quote)| *quote).collect();\n\n    let schema = Arc::new(Schema::new(vec![\n        Field::new(\"id\", DataType::Int64, false),\n        Field::new(\"author\", DataType::Utf8, false),\n        Field::new(\"quote\", DataType::Utf8, false),\n    ]));\n\n    let 
batch = RecordBatch::try_new(\n schema.clone(),\n vec![\n Arc::new(Int64Array::from(ids)),\n Arc::new(StringArray::from(authors)),\n Arc::new(StringArray::from(quotes)),\n ],\n )\n .unwrap();\n let reader = RecordBatchIterator::new(vec![Ok(batch)].into_iter(), schema);\n Box::new(reader)\n}\n"; + +export const RsVersioningRollback = "table.checkout(version_after_mod).await.unwrap();\ntable.restore().await.unwrap();\nlet versions_after_rollback = table.list_versions().await.unwrap();\nlet version_count_after_rollback = versions_after_rollback.len();\nprintln!(\n \"Total number of versions after rollback: {}\",\n version_count_after_rollback\n);\n"; + +export const RsVersioningUpdateData = "table\n .update()\n .only_if(\"author = 'Richard'\")\n .column(\"author\", \"'Richard Daniel Sanchez'\")\n .execute()\n .await\n .unwrap();\nlet rows_after_update = table\n .count_rows(Some(\"author = 'Richard Daniel Sanchez'\".to_string()))\n .await\n .unwrap();\nprintln!(\n \"Rows updated to Richard Daniel Sanchez: {}\",\n rows_after_update\n);\n"; + diff --git a/docs/tables/consistency.mdx b/docs/tables/consistency.mdx index 95f638b..fa248a4 100644 --- a/docs/tables/consistency.mdx +++ b/docs/tables/consistency.mdx @@ -6,24 +6,38 @@ icon: "water" --- import { PyConsistencyStrong as ConsistencyStrong, + TsConsistencyStrong as TsConsistencyStrong, + RsConsistencyStrong as RsConsistencyStrong, PyConsistencyEventual as ConsistencyEventual, + TsConsistencyEventual as TsConsistencyEventual, + RsConsistencyEventual as RsConsistencyEventual, PyConsistencyCheckoutLatest as ConsistencyCheckoutLatest, + TsConsistencyCheckoutLatest as TsConsistencyCheckoutLatest, + RsConsistencyCheckoutLatest as RsConsistencyCheckoutLatest, + RsUpdateMakeUsersReader as RsConsistencyMakeUsersReader, } from '/snippets/tables.mdx'; -You can set the `read_consistency_interval` parameter on connections to achieve different levels of read consistency. This parameter determines how frequently the database synchronizes with the underlying storage system to check for updates made by other processes. If another process updates a table, the database will not see the changes until the next synchronization. +You can set `read_consistency_interval` on the connection to control how often reads check for updates from other writers. -There are three possible settings for `read_consistency_interval`: +There are three possible settings for `read_consistency_interval` when working with [LanceTable](/tables-and-namespaces#understanding-tables): -1. **Unset (default)**: The database does not check for updates to tables made by other processes. This provides the best query performance, but means that clients may not see the most up-to-date data. This setting is suitable for applications where the data does not change during the lifetime of the table reference. -2. **Zero seconds (Strong consistency)**: The database checks for updates on every read. This provides the strongest consistency guarantees, ensuring that all clients see the latest committed data. However, it has the most overhead. This setting is suitable when consistency matters more than having high QPS. -3. **Custom interval (Eventual consistency)**: The database checks for updates at a custom interval, such as every 5 seconds. This provides eventual consistency, allowing for some lag between write and read operations. Performance-wise, this is a middle ground between strong consistency and no consistency check. 
This setting is suitable for applications where immediate consistency is not critical, but clients should see updated data eventually. +1. **Unset (default)**: no automatic cross-process refresh checks. +2. **Zero seconds**: check for updates on every read (strongest freshness). +3. **Non-zero interval**: check for updates after the interval elapses (eventual refresh). - -In LanceDB Enterprise, read consistency are tunable via the configuration -settings. In LanceDB Cloud, readers are always strongly consistent. - +The value you set depends on your application's consistency needs and performance requirements. +For example, a real-time dashboard might require strong consistency, while a batch analytics job might be +fine with eventual consistency. -## Configuring Consistency Parameters + +**Consistency in Remote Tables** + +In LanceDB Enterprise (where you use `RemoteTable`), consistency is deployment-configured +(via a `weak_read_consistency_interval_seconds` parameter in the cluster setup), and is not an end-user setting in the SDK. +You can still use `checkout_latest` / `checkoutLatest` for explicit manual refresh. + + +## Configure consistency parameters To set strong consistency, set the interval to 0: @@ -31,26 +45,63 @@ {ConsistencyStrong} + + + {TsConsistencyStrong} + + + + {RsConsistencyStrong} + + + + + + + {RsConsistencyMakeUsersReader} + + -For eventual consistency, use a custom interval: +For eventual consistency, use a non-zero interval: {ConsistencyEventual} + + + {TsConsistencyEventual} + + + + {RsConsistencyEventual} + -By default, a `Table` will never check for updates from other writers. To manually check for updates you can use `checkout_latest`: +With the default unset interval, tables do not auto-refresh from other writers. +To manually check for updates, use `checkout_latest` / `checkoutLatest`: {ConsistencyCheckoutLatest} + + + {TsConsistencyCheckoutLatest} + + + + {RsConsistencyCheckoutLatest} + -## Handling bad vectors +## Handle bad vectors + + +This section is currently specific to the Python SDK. + In LanceDB Python, you can use the `on_bad_vectors` parameter to choose how invalid vector values are handled. Invalid vectors are vectors that are not valid diff --git a/docs/tables/create.mdx b/docs/tables/create.mdx index b80950a..95c3699 100644 --- a/docs/tables/create.mdx +++ b/docs/tables/create.mdx @@ -217,7 +217,7 @@ for a `created_at` field. When you run this code, it should raise the `ValidationError`. -### Using Iterators / Writing Large Datasets +### Use Iterators / Write Large Datasets For large ingests, prefer batching instead of adding one row at a time. Python and Rust can create a table directly from Arrow batch iterators or readers. In TypeScript, the practical pattern today is to create an empty table and append Arrow batches in chunks. @@ -255,7 +255,7 @@ If you forget the name of your table, you can always get a listing of all table -## Creating empty table +## Create empty table You can create an empty table for scenarios where you want to add data to the table later. An example would be when you want to collect data from a stream/external file and then add it to a table in batches. 
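To make that batching flow concrete, here is a minimal TypeScript sketch of the create-empty-then-append pattern, assuming a local `./data` database and a hypothetical `stream_example` table name; it only uses `connect`, `createEmptyTable`, and `add`, the same calls the snippets above rely on:

```typescript
import * as arrow from "apache-arrow";
import * as lancedb from "@lancedb/lancedb";

// Create the table up front with an explicit schema; rows arrive later.
const db = await lancedb.connect("./data");
const streamSchema = new arrow.Schema([
  new arrow.Field(
    "vector",
    new arrow.FixedSizeList(2, new arrow.Field("item", new arrow.Float32(), true)),
  ),
  new arrow.Field("item", new arrow.Utf8()),
]);
const streamTable = await db.createEmptyTable("stream_example", streamSchema, {
  mode: "overwrite",
});

// Append incoming rows in small chunks instead of one row at a time.
const incoming = Array.from({ length: 100 }, (_, i) => ({
  vector: [i * 0.1, i * 0.2],
  item: `item-${i}`,
}));
const chunkSize = 25;
for (let i = 0; i < incoming.length; i += chunkSize) {
  await streamTable.add(incoming.slice(i, i + chunkSize));
}
```

Appending in chunks keeps each write small and bounded in memory while avoiding the per-row overhead of single-row adds.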
diff --git a/docs/tables/multimodal.mdx b/docs/tables/multimodal.mdx index 63f0f0a..ec32440 100644 --- a/docs/tables/multimodal.mdx +++ b/docs/tables/multimodal.mdx @@ -8,34 +8,58 @@ keywords: ["blob", "large binary", "blobs", "multimodal"] import { PyMultimodalImports as MultimodalImports, + TsMultimodalImports as TsMultimodalImports, + RsMultimodalImports as RsMultimodalImports, PyCreateDummyData as CreateDummyData, + TsCreateDummyData as TsCreateDummyData, + RsCreateDummyData as RsCreateDummyData, PyDefineSchema as DefineSchema, + TsDefineSchema as TsDefineSchema, + RsDefineSchema as RsDefineSchema, PyIngestData as IngestData, + TsIngestData as TsIngestData, + RsIngestData as RsIngestData, PySearchData as SearchData, + TsSearchData as TsSearchData, + RsSearchData as RsSearchData, PyProcessResults as ProcessResults, + TsProcessResults as TsProcessResults, + RsProcessResults as RsProcessResults, PyBlobApiSchema as BlobApiSchema, + TsBlobApiSchema as TsBlobApiSchema, + RsBlobApiSchema as RsBlobApiSchema, PyBlobApiIngest as BlobApiIngest, + TsBlobApiIngest as TsBlobApiIngest, + RsBlobApiIngest as RsBlobApiIngest, } from '/snippets/multimodal.mdx'; LanceDB handles multimodal data—images, audio, video, and PDF files—natively by storing the raw bytes in a binary column alongside your vectors and metadata. This approach simplifies your data infrastructure by keeping the raw assets and their embeddings in the same database, eliminating the need for separate object storage for many use cases. This guide demonstrates how to ingest, store, and retrieve image data using standard binary columns, and also introduces the **Lance Blob API** for optimized handling of larger multimodal files. -## Storing binary data +## Store binary data -To store binary data, you need to use the `pa.binary()` data type in your Arrow schema. In Python, this corresponds to `bytes` objects if you're using LanceDB's Pydantic `LanceModel` to define the schema. +To store binary data, define a binary Arrow field in your schema (`pa.binary()` in Python, `Binary` in TypeScript, and `DataType::Binary` in Rust). ### 1. Setup and imports -First, let's import the necessary libraries. We'll use `PIL` (Pillow) for image handling and `io` for byte conversion. +First, import the necessary libraries for LanceDB and Arrow in your SDK. {MultimodalImports} + + + {TsMultimodalImports} + + + + {RsMultimodalImports} + -### 2. Preparing data +### 2. Prepare data For this example, we'll create some dummy in-memory images. In a real application, you would read these from files or an API. The key is to convert your data (image, audio, etc.) into a raw `bytes` object. @@ -43,9 +67,17 @@ For this example, we'll create some dummy in-memory images. In a real applicatio {CreateDummyData} + + + {TsCreateDummyData} + + + + {RsCreateDummyData} + -### 3. Defining the schema +### 3. Define the schema When creating the table, it is **highly recommended** to define the schema explicitly. This ensures that your binary data is correctly interpreted as a `binary` type by Arrow/LanceDB and not as a generic string or list. @@ -53,9 +85,17 @@ When creating the table, it is **highly recommended** to define the schema expli {DefineSchema} + + + {TsDefineSchema} + + + + {RsDefineSchema} + -### 4. Ingesting data +### 4. Ingest data Now, create the table using the data and the defined schema. @@ -63,9 +103,17 @@ Now, create the table using the data and the defined schema. 
{IngestData} + + + {TsIngestData} + + + + {RsIngestData} + -## Retrieving and using blobs +## Retrieve and use blobs When you search your LanceDB table, you can retrieve the binary column just like any other metadata. @@ -73,23 +121,39 @@ When you search your LanceDB table, you can retrieve the binary column just like {SearchData} + + + {TsSearchData} + + + + {RsSearchData} + -### Converting bytes back to objects +### Convert bytes back to objects -Once you have the `bytes` data back from the search result, you can decode it back into its original format (e.g., a PIL Image, an Audio buffer, etc.). +Once you have the bytes back from the search result, you can decode them into the original format (for example, an image object or audio buffer). {ProcessResults} + + + {TsProcessResults} + + + + {RsProcessResults} + ## Large Blobs (Blob API) -For larger files like high-resolution images or videos, Lance provides a specialized **Blob API**. By using `pa.large_binary()` and specific metadata, you enable **lazy loading** and optimized encoding. This allows you to work with massive datasets without loading all binary data into memory upfront. +For larger files like high-resolution images or videos, Lance provides a specialized **Blob API**. By using a large-binary Arrow type (`pa.large_binary()` in Python, `LargeBinary` in TypeScript, and `DataType::LargeBinary` in Rust) and specific metadata, you enable **lazy loading** and optimized encoding. This allows you to work with massive datasets without loading all binary data into memory upfront. -### 1. Defining a blob schema +### 1. Define a blob schema To use the Blob API, you must mark the column with `{"lance-encoding:blob": "true"}` metadata. @@ -97,9 +161,17 @@ To use the Blob API, you must mark the column with `{"lance-encoding:blob": "tru {BlobApiSchema} + + + {TsBlobApiSchema} + + + + {RsBlobApiSchema} + -### 2. Ingesting large blobs +### 2. Ingest large blobs You can then ingest data normally, and Lance will handle the optimized storage. @@ -107,6 +179,14 @@ You can then ingest data normally, and Lance will handle the optimized storage. {BlobApiIngest} + + + {TsBlobApiIngest} + + + + {RsBlobApiIngest} + diff --git a/docs/tables/update.mdx b/docs/tables/update.mdx index 3addb71..355f4e4 100644 --- a/docs/tables/update.mdx +++ b/docs/tables/update.mdx @@ -17,6 +17,31 @@ import { PyMergePartialColumns as MergePartialColumns, PyDeleteOperation as DeleteOperation, PyUpdateOptimizeCleanup as UpdateOptimizeCleanup, + TsUpdateConnectCloud, + TsUpdateConnectLocal, + TsUpdateExampleTableSetup, + TsUpdateOperation, + TsUpdateUsingSql, + TsMergeMatchedUpdateOnly, + TsInsertIfNotExists, + TsMergeUpdateInsert, + TsMergeDeleteMissingBySource, + TsMergePartialColumns, + TsDeleteOperation, + TsUpdateOptimizeCleanup, + RsUpdateConnectCloud, + RsUpdateConnectLocal, + RsUpdateExampleTableSetup, + RsUpdateMakeUsersReader, + RsUpdateOperation, + RsUpdateUsingSql, + RsMergeMatchedUpdateOnly, + RsInsertIfNotExists, + RsMergeUpdateInsert, + RsMergeDeleteMissingBySource, + RsMergePartialColumns, + RsDeleteOperation, + RsUpdateOptimizeCleanup, } from '/snippets/tables.mdx'; Updating or modifying data involves changing rows in an existing table. 
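+
+As a quick orientation before the per-operation sections below, here is a condensed Python sketch of the operations this page covers (the table contents are illustrative only):
+
+```python
+import lancedb
+
+db = lancedb.connect("./data")
+table = db.create_table(
+    "users_example",
+    [
+        {"id": 1, "name": "Alice", "login_count": 10},
+        {"id": 2, "name": "Bob", "login_count": 20},
+    ],
+    mode="overwrite",
+)
+
+# Targeted update: rewrite matching rows in place
+table.update(where="id = 2", values={"name": "Bobby"})
+
+# Upsert: update rows whose key matches, insert the rest
+table.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(
+    [
+        {"id": 2, "name": "Bobby", "login_count": 21},
+        {"id": 3, "name": "Charlie", "login_count": 5},
+    ]
+)
+
+# Delete rows by SQL predicate
+table.delete("id = 3")
+```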
@@ -37,13 +62,15 @@ Connect to your local LanceDB instance: {UpdateConnectLocal} - -Expected output: + + {TsUpdateConnectLocal} + -| Variable | Value | -| --- | --- | -| `db` | A connected LanceDB database handle pointing to `./data` | + + {RsUpdateConnectLocal} + + Or, connect to your LanceDB remote cluster: @@ -51,13 +78,24 @@ Or, connect to your LanceDB remote cluster: {UpdateConnectCloud} + + + {TsUpdateConnectCloud} + + + + {RsUpdateConnectCloud} + -Expected output: + -| Variable | Value | -| --- | --- | -| `db` | A connected LanceDB database handle for your remote project | + + + {RsUpdateMakeUsersReader} + + + ## Create the example table @@ -66,6 +104,14 @@ We'll start by creating a simple table of a table with `id`, `name`, and `login_ {UpdateExampleTableSetup} + + + {TsUpdateExampleTableSetup} + + + + {RsUpdateExampleTableSetup} + Expected table contents: @@ -96,6 +142,14 @@ Use `update` when you already know which target rows to modify and you do not ne {UpdateOperation} + + + {TsUpdateOperation} + + + + {RsUpdateOperation} + Expected table contents: @@ -117,6 +171,14 @@ Use `values_sql` when you want to use SQL-like expressions to update rows. This {UpdateUsingSql} + + + {TsUpdateUsingSql} + + + + {RsUpdateUsingSql} + Expected table contents: @@ -167,6 +229,14 @@ This updates keys that already exist in the target table. Source rows with new k {MergeMatchedUpdateOnly} + + + {TsMergeMatchedUpdateOnly} + + + + {RsMergeMatchedUpdateOnly} + Expected table contents: @@ -184,6 +254,14 @@ This inserts only brand-new keys from the source. Existing keys are left unchang {InsertIfNotExists} + + + {TsInsertIfNotExists} + + + + {RsInsertIfNotExists} + Expected table contents: @@ -206,6 +284,14 @@ This is a conventional **upsert**. {MergeUpdateInsert} + + + {TsMergeUpdateInsert} + + + + {RsMergeUpdateInsert} + Expected table contents: @@ -224,6 +310,14 @@ Use `when_not_matched_by_source_delete()` when you want to remove any target row {MergeDeleteMissingBySource} + + + {TsMergeDeleteMissingBySource} + + + + {RsMergeDeleteMissingBySource} + Expected table contents: @@ -243,6 +337,14 @@ Merge updates do not require you to provide values for all columns. You can prov {MergePartialColumns} + + + {TsMergePartialColumns} + + + + {RsMergePartialColumns} + Expected table contents: @@ -265,6 +367,14 @@ for deletion (in the [deletion files](https://lance.org/format/table/#deletion-f {DeleteOperation} + + + {TsDeleteOperation} + + + + {RsDeleteOperation} + Expected table contents: @@ -293,4 +403,12 @@ By default, table cleanup removes data up to 7 days ago. 
If you need to reclaim {UpdateOptimizeCleanup} + + + {TsUpdateOptimizeCleanup} + + + + {RsUpdateOptimizeCleanup} + diff --git a/docs/tables/versioning.mdx b/docs/tables/versioning.mdx index d3dd737..4d0fc69 100644 --- a/docs/tables/versioning.mdx +++ b/docs/tables/versioning.mdx @@ -6,31 +6,43 @@ icon: "clock" --- import { PyVersioningBasicSetup as VersioningBasicSetup, + TsVersioningBasicSetup as TsVersioningBasicSetup, + RsVersioningBasicSetup as RsVersioningBasicSetup, PyVersioningCheckInitialVersion as VersioningCheckInitialVersion, + TsVersioningCheckInitialVersion as TsVersioningCheckInitialVersion, + RsVersioningCheckInitialVersion as RsVersioningCheckInitialVersion, PyVersioningUpdateData as VersioningUpdateData, + TsVersioningUpdateData as TsVersioningUpdateData, + RsVersioningUpdateData as RsVersioningUpdateData, PyVersioningAddData as VersioningAddData, + TsVersioningAddData as TsVersioningAddData, + RsVersioningAddData as RsVersioningAddData, PyVersioningCheckVersionsAfterMod as VersioningCheckVersionsAfterMod, + TsVersioningCheckVersionsAfterMod as TsVersioningCheckVersionsAfterMod, + RsVersioningCheckVersionsAfterMod as RsVersioningCheckVersionsAfterMod, PyVersioningListAllVersions as VersioningListAllVersions, + TsVersioningListAllVersions as TsVersioningListAllVersions, + RsVersioningListAllVersions as RsVersioningListAllVersions, PyVersioningRollback as VersioningRollback, + TsVersioningRollback as TsVersioningRollback, + RsVersioningRollback as RsVersioningRollback, PyVersioningCheckoutLatest as VersioningCheckoutLatest, + TsVersioningCheckoutLatest as TsVersioningCheckoutLatest, + RsVersioningCheckoutLatest as RsVersioningCheckoutLatest, PyVersioningDeleteData as VersioningDeleteData, + TsVersioningDeleteData as TsVersioningDeleteData, + RsVersioningDeleteData as RsVersioningDeleteData, + RsVersioningMakeQuotesReader as RsVersioningMakeQuotesReader, } from '/snippets/tables.mdx'; -LanceDB redefines data management for AI/ML workflows with built-in, -automatic versioning powered by the [Lance columnar format](https://github.com/lancedb/lance). -Every table mutation—appends, updates, deletions, or schema changes — is tracked with -zero configuration, enabling: - -- Time-Travel Debugging: Pinpoint production issues by querying historical table states. -- Atomic Rollbacks: Revert terabyte-scale datasets to any prior version in seconds. -- ML Reproducibility: Exactly reproduce training snapshots (vectors + metadata). -- Branching Workflows: Conduct A/B tests on embeddings/models via lightweight table clones. +This page shows the core table-versioning APIs used in the code snippets for Python, TypeScript, and Rust. +Each operation below maps directly to methods shown in the examples. 
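+
+At a glance, the whole lifecycle looks like this (a minimal Python sketch; the table name and rows are illustrative, and the calls mirror the snippets below):
+
+```python
+import lancedb
+
+db = lancedb.connect("./data")
+table = db.create_table(
+    "quotes_versioning_example",
+    [{"id": 1, "author": "Richard", "quote": "Wubba Lubba Dub Dub!"}],
+    mode="overwrite",
+)
+
+version_before = table.version  # every commit gets a version number
+table.update(where="id = 1", values={"author": "Richard Daniel Sanchez"})
+
+table.restore(version_before)  # roll back: re-commits the old state as a new version
+table.checkout_latest()  # jump back to the newest version
+
+for v in table.list_versions():  # inspect the full history
+    print(v["version"], v["timestamp"])
+```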
## Basic Versioning Example Let's create a table with sample data to demonstrate LanceDB's versioning capabilities: -### Setting Up the Table +### Set Up the Table First, let's create a table with some sample data: @@ -38,9 +50,25 @@ First, let's create a table with some sample data: {VersioningBasicSetup} + + + {TsVersioningBasicSetup} + + + + {RsVersioningBasicSetup} + -### Checking Initial Version + + + + {RsVersioningMakeQuotesReader} + + + + +### Check Initial Version After creating the table, let's check the initial version information: @@ -48,13 +76,21 @@ After creating the table, let's check the initial version information: {VersioningCheckInitialVersion} + + + {TsVersioningCheckInitialVersion} + + + + {RsVersioningCheckInitialVersion} + -## Modifying Data +## Modify Data When you modify data through operations like update or delete, LanceDB automatically creates new versions. -### Updating Existing Data +### Update Existing Data Let's update some existing records to see versioning in action: @@ -62,9 +98,17 @@ Let's update some existing records to see versioning in action: {VersioningUpdateData} + + + {TsVersioningUpdateData} + + + + {RsVersioningUpdateData} + -### Adding New Data +### Add New Data Now let's add more records to the table: @@ -72,9 +116,17 @@ Now let's add more records to the table: {VersioningAddData} + + + {TsVersioningAddData} + + + + {RsVersioningAddData} + -### Checking Version Changes +### Check Version Changes Let's see how the versions have changed after our modifications: @@ -82,82 +134,21 @@ Let's see how the versions have changed after our modifications: {VersioningCheckVersionsAfterMod} - - -## Tracking Changes in Schema - -LanceDB's versioning system automatically tracks every schema modification. This is critical when handling evolving embedding models. For example, adding a new `vector_minilm` column creates a fresh version, enabling seamless A/B testing between embedding generations without recreating the table. 
- -### Preparing Data for Embeddings - -First, let's get the data we want to embed: - -```python -import pyarrow as pa - -# Get data from table -df = table.search().limit(5).to_pandas() -``` - -### Generating Embeddings - -Now let's generate embeddings using the all-MiniLM-L6-v2 model: - -```python -# Let's use "all-MiniLM-L6-v2" model to embed the quotes -model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu") - -# Generate embeddings for each quote and pair with IDs -vectors = model.encode( - df["quote"].tolist(), convert_to_numpy=True, normalize_embeddings=True -) -vector_dim = vectors[0].shape[0] -print(f"Vector dimension: {vector_dim}") - -# Add IDs to vectors array with proper column names -vectors_with_ids = [ - {"id": i + 1, "vector_minilm": vec.tolist()} for i, vec in enumerate(vectors) -] -``` - -### Adding Vector Column to Schema - -Now let's add the vector column to our table schema: - -```python -# Add vector column and merge data -table.add_columns( - {"vector_minilm": f"arrow_cast(NULL, 'FixedSizeList({vector_dim}, Float32)')"} -) - -table.merge_insert( - "id" -).when_matched_update_all().when_not_matched_insert_all().execute(vectors_with_ids) -``` -### Checking Version Changes After Schema Modification - -Let's see how the schema change affected our versioning: - -```python -# Check versions after schema change -versions = table.list_versions() -version_count_after_embed = len(versions) -version_after_embed = table.version -print(f"Number of versions after adding embeddings: {version_count_after_embed}") -print(f"Current version: {version_after_embed}") + + {TsVersioningCheckVersionsAfterMod} + -# Verify the schema change -# The table should now include a vector_minilm column containing -# embeddings generated by the all-MiniLM-L6-v2 model -print(table.schema) -``` + + {RsVersioningCheckVersionsAfterMod} + + ## Rollback to Previous Versions LanceDB supports fast rollbacks to any previous version without data duplication. -### Viewing All Versions +### View All Versions First, let's see all the versions we've created: @@ -165,82 +156,39 @@ First, let's see all the versions we've created: {VersioningListAllVersions} + + + {TsVersioningListAllVersions} + + + + {RsVersioningListAllVersions} + -### Rolling Back to a Previous Version +### Restore a Version Snapshot -Now let's roll back to before we added the vector column: +Now let's restore a captured version snapshot: {VersioningRollback} - - -## Making Changes from Previous Versions - -After restoring a table to an earlier version, you can continue making modifications. In this example, we rolled back to a version before adding embeddings. This allows us to experiment with different embedding models and compare their performance. 
- -### Switching to a Different Embedding Model - -Let's try a different embedding model (all-mpnet-base-v2) to see how it performs: - -```python -# Let's switch to the all-mpnet-base-v2 model to embed the quotes -model = SentenceTransformer("all-mpnet-base-v2", device="cpu") - -# Generate embeddings for each quote and pair with IDs -vectors = model.encode( - df["quote"].tolist(), convert_to_numpy=True, normalize_embeddings=True -) -vector_dim = vectors[0].shape[0] -print(f"Vector dimension: {vector_dim}") - -# Add IDs to vectors array with proper column names -vectors_with_ids = [ - {"id": i + 1, "vector_mpnet": vec.tolist()} for i, vec in enumerate(vectors) -] -``` - -### Adding the New Vector Column -Now let's add the new vector column with the different model: - -```python -# Add vector column and merge data -table.add_columns( - {"vector_mpnet": f"arrow_cast(NULL, 'FixedSizeList({vector_dim}, Float32)')"} -) - -table.merge_insert( - "id" -).when_matched_update_all().when_not_matched_insert_all().execute(vectors_with_ids) -``` - -### Checking Version Changes - -Let's see how this new model affects our versioning: - -```python -# Check versions after schema change -versions = table.list_versions() -version_count_after_alter_embed = len(versions) -version_after_alter_embed = table.version -print( - f"Number of versions after switching model: {version_count_after_alter_embed}" -) -print(f"Current version: {version_after_alter_embed}") + + {TsVersioningRollback} + -# The table should now include a vector_mpnet column containing -# embeddings generated by the all-mpnet-base-v2 model -print(table.schema) -``` + + {RsVersioningRollback} + + ## Delete Data From the Table Let's demonstrate how deletions also create new versions: -### Going Back to Latest Version +### Go Back to Latest Version First, let's return to the latest version: @@ -248,9 +196,17 @@ First, let's return to the latest version: {VersioningCheckoutLatest} + + + {TsVersioningCheckoutLatest} + + + + {RsVersioningCheckoutLatest} + -### Deleting Data +### Delete Data Now let's delete some data to see how it affects versioning: @@ -258,29 +214,27 @@ Now let's delete some data to see how it affects versioning: {VersioningDeleteData} + + + {TsVersioningDeleteData} + + + + {RsVersioningDeleteData} + ### Version History and Operations -Throughout this guide, we've demonstrated various operations that create new versions in LanceDB. -Here's a summary of the version history we created: - -1. **Initial Creation** (v1): Created table with quotes data and basic schema -2. **First Update** (v2): Changed "Richard" to "Richard Daniel Sanchez" -3. **Data Append** (v3): Added new quotes from both characters -4. **Schema Evolution** (v4): Added `vector_minilm` column for embeddings -5. **Embedding Merge** (v5): Populated `vector_minilm` with embeddings -6. **Version Rollback** (v6): Restored to v3 (pre-vector state) -7. **Alternative Schema** (v7): Added `vector_mpnet` column -8. **Alternative Merge** (v8): Populated `vector_mpnet` embeddings -9. **Data Cleanup** (v9): Kept only Richard Daniel Sanchez quotes - -Each version represents a distinct state of your data, allowing you to: - -- Track changes over time -- Compare different embedding strategies -- Revert to previous states -- Maintain data lineage for ML reproducibility +On a fresh table, the snippets in this guide produce this version sequence: + +1. `v1`: create table (`create_table` / `createTable` / `create_table`) +2. `v2`: update rows (`update`) +3. `v3`: add rows (`add`) +4. 
`v4`: restore snapshot (`restore`) from `version_after_mod`/`versionAfterMod` +5. `v5`: delete rows (`delete`) + +Read-only and checkout operations shown here (`list_versions`/`listVersions`, `version`, `checkout`, `checkout_latest`/`checkoutLatest`) do not create new versions. **System Operations** diff --git a/tests/py/test_basic_usage.py b/tests/py/test_basic_usage.py index 52e80af..a010c61 100644 --- a/tests/py/test_basic_usage.py +++ b/tests/py/test_basic_usage.py @@ -69,7 +69,7 @@ def test_basic_usage(db_path_factory): ) db.create_table("camelot_pa", schema=schema, mode="overwrite") # --8<-- [end:basic_create_empty_table] - assert "camelot_pa" in db.table_names() + assert "camelot_pa" in db.list_tables().tables db.drop_table("camelot_pa") # --8<-- [start:basic_add_data] @@ -168,7 +168,7 @@ def test_basic_usage(db_path_factory): # --8<-- [start:basic_drop_table] db.drop_table("camelot") # --8<-- [end:basic_drop_table] - assert "camelot" not in db.table_names() + assert "camelot" not in db.list_tables().tables @pytest.mark.asyncio diff --git a/tests/py/test_tables.py b/tests/py/test_tables.py index 4fcf5ed..394a7f2 100644 --- a/tests/py/test_tables.py +++ b/tests/py/test_tables.py @@ -306,7 +306,7 @@ def test_open_existing_table(tmp_db): db.create_table("test_table", data, mode="overwrite") # List table names - print(db.table_names()) + print(db.list_tables().tables) # Open existing table tbl = db.open_table("test_table") @@ -1084,7 +1084,15 @@ def _setup_versioning_table(tmp_db, data=None, table_name="quotes_versioning_exa import pyarrow as pa if data is None: - data = [{"id": 1, "author": "Richard", "quote": "Wubba Lubba Dub Dub!"}] + data = [ + {"id": 1, "author": "Richard", "quote": "Wubba Lubba Dub Dub!"}, + {"id": 2, "author": "Morty", "quote": "Rick, what's going on?"}, + { + "id": 3, + "author": "Richard", + "quote": "I turned myself into a pickle, Morty!", + }, + ] schema = pa.schema( [ pa.field("id", pa.int64()), @@ -1097,15 +1105,10 @@ def _setup_versioning_table(tmp_db, data=None, table_name="quotes_versioning_exa def test_versioning_basic_setup(tmp_db): # --8<-- [start:versioning_basic_setup] - import lancedb - import numpy as np - import pandas as pd import pyarrow as pa - # Connect to LanceDB db = tmp_db - # Create a table with initial data table_name = "quotes_versioning_example" data = [ {"id": 1, "author": "Richard", "quote": "Wubba Lubba Dub Dub!"}, @@ -1128,34 +1131,33 @@ def test_versioning_basic_setup(tmp_db): table = db.create_table(table_name, data, schema=schema, mode="overwrite") # --8<-- [end:versioning_basic_setup] + assert table.count_rows() == 3 def test_versioning_check_initial_version(tmp_db): table = _setup_versioning_table(tmp_db) # --8<-- [start:versioning_check_initial_version] - # View the initial version versions = table.list_versions() + current_version = table.version print(f"Number of versions after creation: {len(versions)}") - print(f"Current version: {table.version}") + print(f"Current version: {current_version}") # --8<-- [end:versioning_check_initial_version] assert len(versions) == 1 - assert table.version == versions[-1]["version"] + assert current_version == versions[-1]["version"] def test_versioning_flow(tmp_db): table = _setup_versioning_table(tmp_db) # --8<-- [start:versioning_update_data] - # Update author names to be more specific table.update(where="author='Richard'", values={"author": "Richard Daniel Sanchez"}) - rows_after_update = table.count_rows() - print(f"Number of rows after update: {rows_after_update}") + 
rows_after_update = table.count_rows("author = 'Richard Daniel Sanchez'") + print(f"Rows updated to Richard Daniel Sanchez: {rows_after_update}") # --8<-- [end:versioning_update_data] - assert rows_after_update == 1 + assert rows_after_update == 2 # --8<-- [start:versioning_add_data] - # Add more data more_data = [ { "id": 4, @@ -1166,22 +1168,19 @@ def test_versioning_flow(tmp_db): ] table.add(more_data) # --8<-- [end:versioning_add_data] - assert table.count_rows() == 3 + assert table.count_rows() == 5 # --8<-- [start:versioning_check_versions_after_mod] - # Check versions after modifications versions = table.list_versions() version_count_after_mod = len(versions) version_after_mod = table.version print(f"Number of versions after modifications: {version_count_after_mod}") print(f"Current version: {version_after_mod}") # --8<-- [end:versioning_check_versions_after_mod] - assert version_count_after_mod == len(versions) assert version_count_after_mod >= 2 assert version_after_mod == versions[-1]["version"] # --8<-- [start:versioning_list_all_versions] - # Let's see all versions versions = table.list_versions() for v in versions: print(f"Version {v['version']}, created at {v['timestamp']}") @@ -1189,32 +1188,26 @@ def test_versioning_flow(tmp_db): assert len(versions) >= 1 # --8<-- [start:versioning_rollback] - # Let's roll back to before we added the vector column - # We'll use the version after modifications but before adding embeddings table.restore(version_after_mod) - - # Notice we have one more version now, not less! versions = table.list_versions() version_count_after_rollback = len(versions) print(f"Total number of versions after rollback: {version_count_after_rollback}") # --8<-- [end:versioning_rollback] - assert version_count_after_rollback == len(versions) assert version_count_after_rollback == version_count_after_mod + 1 assert table.version == versions[-1]["version"] + assert table.count_rows() == 5 # --8<-- [start:versioning_checkout_latest] - # Go back to the latest version table.checkout_latest() # --8<-- [end:versioning_checkout_latest] assert table.version == table.list_versions()[-1]["version"] # --8<-- [start:versioning_delete_data] - # Let's delete data from the table - table.delete("author != 'Richard Daniel Sanchez'") + table.delete("author = 'Morty'") rows_after_deletion = table.count_rows() print(f"Number of rows after deletion: {rows_after_deletion}") # --8<-- [end:versioning_delete_data] - assert rows_after_deletion == 2 + assert rows_after_deletion == 3 # ============================================================================ @@ -1226,37 +1219,49 @@ def test_consistency_strong(tmp_db): # --8<-- [start:consistency_strong] from datetime import timedelta - uri = str(tmp_db.uri) if hasattr(tmp_db, "uri") else "memory://" - db = lancedb.connect(uri, read_consistency_interval=timedelta(0)) - # Create table first - data = [{"vector": [1.1, 1.2], "lat": 45.5}] - db.create_table("test_table", data, mode="overwrite") - tbl = db.open_table("test_table") + uri = str(tmp_db.uri) + writer_db = lancedb.connect(uri) + reader_db = lancedb.connect(uri, read_consistency_interval=timedelta(0)) + writer_table = writer_db.create_table("consistency_strong_table", [{"id": 1}], mode="overwrite") + reader_table = reader_db.open_table("consistency_strong_table") + writer_table.add([{"id": 2}]) + rows_after_write = reader_table.count_rows() + print(f"Rows visible with strong consistency: {rows_after_write}") # --8<-- [end:consistency_strong] + assert rows_after_write == 2 def 
test_consistency_eventual(tmp_db):
     # --8<-- [start:consistency_eventual]
     from datetime import timedelta
 
-    uri = str(tmp_db.uri) if hasattr(tmp_db, "uri") else "memory://"
-    db = lancedb.connect(uri, read_consistency_interval=timedelta(seconds=5))
-    # Create table first
-    data = [{"vector": [1.1, 1.2], "lat": 45.5}]
-    db.create_table("test_table", data, mode="overwrite")
-    tbl = db.open_table("test_table")
+    uri = str(tmp_db.uri)
+    writer_db = lancedb.connect(uri)
+    reader_db = lancedb.connect(uri, read_consistency_interval=timedelta(seconds=3600))
+    writer_table = writer_db.create_table("consistency_eventual_table", [{"id": 1}], mode="overwrite")
+    reader_table = reader_db.open_table("consistency_eventual_table")
+    writer_table.add([{"id": 2}])
+    rows_after_write = reader_table.count_rows()
+    print(f"Rows visible before eventual refresh interval: {rows_after_write}")
     # --8<-- [end:consistency_eventual]
+    assert rows_after_write == 1
 
 
 def test_consistency_checkout_latest(tmp_db):
     # --8<-- [start:consistency_checkout_latest]
-    db = tmp_db
-    # Create table first
-    data = [{"vector": [1.1, 1.2], "lat": 45.5}]
-    tbl = db.create_table("test_table", data, mode="overwrite")
-
-    # (Other writes happen to my_table from another process)
-
-    # Check for updates
-    tbl.checkout_latest()
+    uri = str(tmp_db.uri)
+    writer_db = lancedb.connect(uri)
+    reader_db = lancedb.connect(uri)
+    writer_table = writer_db.create_table("consistency_checkout_latest_table", [{"id": 1}], mode="overwrite")
+    reader_table = reader_db.open_table("consistency_checkout_latest_table")
+
+    writer_table.add([{"id": 2}])
+    rows_before_refresh = reader_table.count_rows()
+    print(f"Rows before checkout_latest: {rows_before_refresh}")
+
+    reader_table.checkout_latest()
+    rows_after_refresh = reader_table.count_rows()
+    print(f"Rows after checkout_latest: {rows_after_refresh}")
     # --8<-- [end:consistency_checkout_latest]
+    assert rows_before_refresh == 1
+    assert rows_after_refresh == 2
diff --git a/tests/rs/Cargo.toml b/tests/rs/Cargo.toml
index 10f7537..5dc7d04 100644
--- a/tests/rs/Cargo.toml
+++ b/tests/rs/Cargo.toml
@@ -32,3 +32,7 @@ path = "quickstart.rs"
 [[example]]
 name = "tables"
 path = "tables.rs"
+
+[[example]]
+name = "multimodal"
+path = "multimodal.rs"
diff --git a/tests/rs/multimodal.rs b/tests/rs/multimodal.rs
new file mode 100644
index 0000000..6c2c2f3
--- /dev/null
+++ b/tests/rs/multimodal.rs
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The LanceDB Authors
+
+// --8<-- [start:multimodal_imports]
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use arrow_array::types::Float32Type;
+use arrow_array::{
+    BinaryArray, FixedSizeListArray, Int32Array, Int64Array, LargeBinaryArray, RecordBatch,
+    RecordBatchIterator, StringArray,
+};
+use arrow_schema::{DataType, Field, Schema};
+use futures_util::TryStreamExt;
+use lancedb::connect;
+use lancedb::database::CreateTableMode;
+use lancedb::query::{ExecutableQuery, QueryBase};
+// --8<-- [end:multimodal_imports]
+
+#[tokio::main]
+async fn main() {
+    let temp_dir = tempfile::tempdir().unwrap();
+    let db_uri = temp_dir.path().to_str().unwrap().to_string();
+    let db = connect(&db_uri).execute().await.unwrap();
+
+    // --8<-- [start:create_dummy_data]
+    let create_dummy_image = |color: u8| -> Vec<u8> {
+        let mut png_like = vec![137, 80, 78, 71, 13, 10, 26, 10];
+        png_like.push(color);
+        png_like
+    };
+
+    let data = vec![
+        (
+            1_i32,
+            "red_square.png",
+            vec![0.1_f32; 128],
+            create_dummy_image(1),
+            "red",
+        ),
+        (
+            2_i32,
"blue_square.png", + vec![0.2_f32; 128], + create_dummy_image(2), + "blue", + ), + ]; + // --8<-- [end:create_dummy_data] + + // --8<-- [start:define_schema] + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("filename", DataType::Utf8, false), + Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 128), + false, + ), + Field::new("image_blob", DataType::Binary, false), + Field::new("label", DataType::Utf8, false), + ]); + // --8<-- [end:define_schema] + + // --8<-- [start:ingest_data] + let schema = Arc::new(schema); + let image_batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(data.iter().map(|row| row.0))), + Arc::new(StringArray::from_iter_values(data.iter().map(|row| row.1))), + Arc::new( + FixedSizeListArray::from_iter_primitive::( + data.iter() + .map(|row| Some(row.2.iter().copied().map(Some).collect::>())), + 128, + ), + ), + Arc::new(BinaryArray::from_iter_values( + data.iter().map(|row| row.3.as_slice()), + )), + Arc::new(StringArray::from_iter_values(data.iter().map(|row| row.4))), + ], + ) + .unwrap(); + let image_reader = RecordBatchIterator::new(vec![Ok(image_batch)].into_iter(), schema.clone()); + let table = db + .create_table("images", image_reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + // --8<-- [end:ingest_data] + assert_eq!(table.count_rows(None).await.unwrap(), 2); + + // --8<-- [start:search_data] + let query_vector = vec![0.1_f32; 128]; + let results = table + .query() + .nearest_to(query_vector) + .unwrap() + .limit(1) + .execute() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + // --8<-- [end:search_data] + + // --8<-- [start:process_results] + for batch in &results { + let filenames = batch + .column_by_name("filename") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let images = batch + .column_by_name("image_blob") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + for row in 0..batch.num_rows() { + let image_bytes = images.value(row); + println!( + "Retrieved image: {}, Byte length: {}", + filenames.value(row), + image_bytes.len() + ); + } + } + // --8<-- [end:process_results] + let search_rows: usize = results.iter().map(|batch| batch.num_rows()).sum(); + assert_eq!(search_rows, 1); + + // --8<-- [start:blob_api_schema] + let blob_metadata = HashMap::from([( + "lance-encoding:blob".to_string(), + "true".to_string(), + )]); + let blob_schema = Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("video", DataType::LargeBinary, true).with_metadata(blob_metadata), + ]); + // --8<-- [end:blob_api_schema] + + // --8<-- [start:blob_api_ingest] + let blob_rows = vec![ + (1_i64, b"fake_video_bytes_1".to_vec()), + (2_i64, b"fake_video_bytes_2".to_vec()), + ]; + + let blob_schema = Arc::new(blob_schema); + let blob_batch = RecordBatch::try_new( + blob_schema.clone(), + vec![ + Arc::new(Int64Array::from_iter_values(blob_rows.iter().map(|row| row.0))), + Arc::new(LargeBinaryArray::from_iter_values( + blob_rows.iter().map(|row| row.1.as_slice()), + )), + ], + ) + .unwrap(); + let blob_reader = RecordBatchIterator::new(vec![Ok(blob_batch)].into_iter(), blob_schema); + let blob_table = db + .create_table("videos", blob_reader) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + // --8<-- [end:blob_api_ingest] + assert_eq!(blob_table.count_rows(None).await.unwrap(), 2); +} diff --git a/tests/rs/tables.rs b/tests/rs/tables.rs 
index fa167e7..65e0c20 100644
--- a/tests/rs/tables.rs
+++ b/tests/rs/tables.rs
@@ -2,24 +2,92 @@
 // SPDX-FileCopyrightText: Copyright The LanceDB Authors
 
 use std::sync::Arc;
+use std::time::Duration as StdDuration;
 
 use arrow_array::types::Float32Type;
 use arrow_array::{
-    FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch,
-    RecordBatchIterator, StringArray,
+    Array, FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch,
+    RecordBatchIterator, RecordBatchReader, StringArray,
 };
 use arrow_schema::{DataType, Field, Schema};
 use lancedb::connect;
 use lancedb::database::CreateTableMode;
-use lancedb::table::{ColumnAlteration, NewColumnTransform};
+use lancedb::table::{ColumnAlteration, Duration, NewColumnTransform, OptimizeAction};
+
+// --8<-- [start:update_make_users_reader]
+fn make_users_reader(
+    ids: Vec<i64>,
+    names: Vec<&str>,
+    login_counts: Option<Vec<i64>>,
+) -> Box<dyn RecordBatchReader + Send> {
+    let mut fields = vec![
+        Field::new("id", DataType::Int64, false),
+        Field::new("name", DataType::Utf8, false),
+    ];
+    let mut columns: Vec<Arc<dyn Array>> =
+        vec![Arc::new(Int64Array::from(ids)), Arc::new(StringArray::from(names))];
+
+    if let Some(login_counts) = login_counts {
+        fields.push(Field::new("login_count", DataType::Int64, true));
+        columns.push(Arc::new(Int64Array::from(login_counts)));
+    }
+
+    let schema = Arc::new(Schema::new(fields));
+    let batch = RecordBatch::try_new(schema.clone(), columns).unwrap();
+    let reader = RecordBatchIterator::new(vec![Ok(batch)].into_iter(), schema);
+    Box::new(reader)
+}
+// --8<-- [end:update_make_users_reader]
+
+// --8<-- [start:versioning_make_quotes_reader]
+fn make_quotes_reader(rows: Vec<(i64, &str, &str)>) -> Box<dyn RecordBatchReader + Send> {
+    let ids: Vec<i64> = rows.iter().map(|(id, _, _)| *id).collect();
+    let authors: Vec<&str> = rows.iter().map(|(_, author, _)| *author).collect();
+    let quotes: Vec<&str> = rows.iter().map(|(_, _, quote)| *quote).collect();
+
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("id", DataType::Int64, false),
+        Field::new("author", DataType::Utf8, false),
+        Field::new("quote", DataType::Utf8, false),
+    ]));
+
+    let batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(Int64Array::from(ids)),
+            Arc::new(StringArray::from(authors)),
+            Arc::new(StringArray::from(quotes)),
+        ],
+    )
+    .unwrap();
+    let reader = RecordBatchIterator::new(vec![Ok(batch)].into_iter(), schema);
+    Box::new(reader)
+}
+// --8<-- [end:versioning_make_quotes_reader]
+
+#[allow(dead_code)]
+async fn update_connect_cloud_example() {
+    // --8<-- [start:update_connect_cloud]
+    let uri = "db://your-project-slug";
+    let api_key = "your-api-key";
+    let region = "us-east-1";
+    // --8<-- [end:update_connect_cloud]
+    let _ = (uri, api_key, region);
+}
+
+#[allow(dead_code)]
+async fn update_connect_local_example() {
+    // --8<-- [start:update_connect_local]
+    let db = connect("./data").execute().await.unwrap();
+    // --8<-- [end:update_connect_local]
+    let _ = db;
+}
 
 #[tokio::main]
 async fn main() {
     let temp_dir = tempfile::tempdir().unwrap();
-    let db = connect(temp_dir.path().to_str().unwrap())
-        .execute()
-        .await
-        .unwrap();
+    let db_uri = temp_dir.path().to_str().unwrap().to_string();
+    let db = connect(&db_uri).execute().await.unwrap();
 
     // --8<-- [start:create_table_from_dicts]
     struct Location {
@@ -623,4 +691,427 @@
     .unwrap();
     // --8<-- [end:alter_vector_column]
     assert_eq!(vector_table.count_rows(None).await.unwrap(), 1);
+
+    // --8<-- [start:update_example_table_setup]
+    let table = db
+        .create_table(
+            "users_example",
+ make_users_reader(vec![1, 2], vec!["Alice", "Bob"], Some(vec![10, 20])), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + // --8<-- [end:update_example_table_setup] + let _ = table; + + // --8<-- [start:update_operation] + let table = db + .create_table( + "users_example", + make_users_reader(vec![1, 2], vec!["Alice", "Bob"], Some(vec![10, 20])), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + table + .update() + .only_if("id = 2") + .column("name", "'Bobby'") + .execute() + .await + .unwrap(); + // --8<-- [end:update_operation] + + // --8<-- [start:update_using_sql] + let table = db + .create_table( + "users_example", + make_users_reader(vec![1, 2], vec!["Alice", "Bob"], Some(vec![10, 20])), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + table + .update() + .only_if("id = 2") + .column("login_count", "login_count + 1") + .execute() + .await + .unwrap(); + // --8<-- [end:update_using_sql] + + // --8<-- [start:merge_matched_update_only] + let table = db + .create_table( + "users_example", + make_users_reader(vec![1, 2], vec!["Alice", "Bob"], Some(vec![10, 20])), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + + let mut merge_insert = table.merge_insert(&["id"]); + merge_insert.when_matched_update_all(None); + merge_insert + .execute(make_users_reader( + vec![2, 3], + vec!["Bobby", "Charlie"], + Some(vec![21, 5]), + )) + .await + .unwrap(); + // --8<-- [end:merge_matched_update_only] + + // --8<-- [start:insert_if_not_exists] + let table = db + .create_table( + "users_example", + make_users_reader(vec![1, 2], vec!["Alice", "Bob"], Some(vec![10, 20])), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + + let mut merge_insert = table.merge_insert(&["id"]); + merge_insert.when_not_matched_insert_all(); + merge_insert + .execute(make_users_reader( + vec![2, 3], + vec!["Bobby", "Charlie"], + Some(vec![21, 5]), + )) + .await + .unwrap(); + // --8<-- [end:insert_if_not_exists] + + // --8<-- [start:merge_update_insert] + let table = db + .create_table( + "users_example", + make_users_reader(vec![1, 2], vec!["Alice", "Bob"], Some(vec![10, 20])), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + + let mut merge_insert = table.merge_insert(&["id"]); + merge_insert + .when_matched_update_all(None) + .when_not_matched_insert_all(); + merge_insert + .execute(make_users_reader( + vec![2, 3], + vec!["Bobby", "Charlie"], + Some(vec![21, 5]), + )) + .await + .unwrap(); + // --8<-- [end:merge_update_insert] + + // --8<-- [start:merge_delete_missing_by_source] + let table = db + .create_table( + "users_example", + make_users_reader( + vec![1, 2, 3], + vec!["Alice", "Bob", "Charlie"], + Some(vec![10, 20, 5]), + ), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + + let mut merge_insert = table.merge_insert(&["id"]); + merge_insert + .when_matched_update_all(None) + .when_not_matched_insert_all() + .when_not_matched_by_source_delete(None); + merge_insert + .execute(make_users_reader( + vec![2, 3], + vec!["Bobby", "Charlie"], + Some(vec![21, 5]), + )) + .await + .unwrap(); + // --8<-- [end:merge_delete_missing_by_source] + + // --8<-- [start:merge_partial_columns] + let table = db + .create_table( + "users_example", + make_users_reader(vec![1, 2], vec!["Alice", "Bob"], Some(vec![10, 20])), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + + let mut merge_insert = table.merge_insert(&["id"]); + 
merge_insert + .when_matched_update_all(None) + .when_not_matched_insert_all(); + merge_insert + .execute(make_users_reader(vec![2, 3], vec!["Bobby", "Charlie"], None)) + .await + .unwrap(); + // --8<-- [end:merge_partial_columns] + + let table = db + .create_table( + "users_example", + make_users_reader( + vec![1, 2, 3], + vec!["Alice", "Bob", "Charlie"], + Some(vec![10, 20, 5]), + ), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + + // --8<-- [start:delete_operation] + // delete data + let predicate = "id = 3"; + table.delete(predicate).await.unwrap(); + // --8<-- [end:delete_operation] + + let table = db + .create_table( + "users_cleanup_example", + make_users_reader( + vec![1, 2, 3], + vec!["Alice", "Bob", "Charlie"], + Some(vec![10, 20, 5]), + ), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + + // --8<-- [start:update_optimize_cleanup] + table + .optimize(OptimizeAction::Prune { + older_than: Some(Duration::days(1)), + delete_unverified: None, + error_if_tagged_old_versions: None, + }) + .await + .unwrap(); + // --8<-- [end:update_optimize_cleanup] + + // --8<-- [start:consistency_strong] + let strong_writer_db = connect(&db_uri).execute().await.unwrap(); + let strong_reader_db = connect(&db_uri) + .read_consistency_interval(StdDuration::from_secs(0)) + .execute() + .await + .unwrap(); + let strong_writer_table = strong_writer_db + .create_table( + "consistency_strong_table", + make_users_reader(vec![1], vec!["Alice"], None), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + let strong_reader_table = strong_reader_db + .open_table("consistency_strong_table") + .execute() + .await + .unwrap(); + strong_writer_table + .add(make_users_reader(vec![2], vec!["Bob"], None)) + .execute() + .await + .unwrap(); + let strong_rows_after_write = strong_reader_table.count_rows(None).await.unwrap(); + println!( + "Rows visible with strong consistency: {}", + strong_rows_after_write + ); + // --8<-- [end:consistency_strong] + assert_eq!(strong_rows_after_write, 2); + + // --8<-- [start:consistency_eventual] + let eventual_writer_db = connect(&db_uri).execute().await.unwrap(); + let eventual_reader_db = connect(&db_uri) + .read_consistency_interval(StdDuration::from_secs(3600)) + .execute() + .await + .unwrap(); + let eventual_writer_table = eventual_writer_db + .create_table( + "consistency_eventual_table", + make_users_reader(vec![1], vec!["Alice"], None), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + let eventual_reader_table = eventual_reader_db + .open_table("consistency_eventual_table") + .execute() + .await + .unwrap(); + eventual_writer_table + .add(make_users_reader(vec![2], vec!["Bob"], None)) + .execute() + .await + .unwrap(); + let eventual_rows_after_write = eventual_reader_table.count_rows(None).await.unwrap(); + println!( + "Rows visible before eventual refresh interval: {}", + eventual_rows_after_write + ); + // --8<-- [end:consistency_eventual] + assert_eq!(eventual_rows_after_write, 1); + + // --8<-- [start:consistency_checkout_latest] + let checkout_writer_db = connect(&db_uri).execute().await.unwrap(); + let checkout_reader_db = connect(&db_uri).execute().await.unwrap(); + let checkout_writer_table = checkout_writer_db + .create_table( + "consistency_checkout_latest_table", + make_users_reader(vec![1], vec!["Alice"], None), + ) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + let checkout_reader_table = checkout_reader_db + 
.open_table("consistency_checkout_latest_table") + .execute() + .await + .unwrap(); + checkout_writer_table + .add(make_users_reader(vec![2], vec!["Bob"], None)) + .execute() + .await + .unwrap(); + let rows_before_refresh = checkout_reader_table.count_rows(None).await.unwrap(); + println!("Rows before checkout_latest: {}", rows_before_refresh); + checkout_reader_table.checkout_latest().await.unwrap(); + let rows_after_refresh = checkout_reader_table.count_rows(None).await.unwrap(); + println!("Rows after checkout_latest: {}", rows_after_refresh); + // --8<-- [end:consistency_checkout_latest] + assert_eq!(rows_before_refresh, 1); + assert_eq!(rows_after_refresh, 2); + + // --8<-- [start:versioning_basic_setup] + let table_name = "quotes_versioning_example"; + let data = vec![ + (1, "Richard", "Wubba Lubba Dub Dub!"), + (2, "Morty", "Rick, what's going on?"), + (3, "Richard", "I turned myself into a pickle, Morty!"), + ]; + + let table = db + .create_table(table_name, make_quotes_reader(data)) + .mode(CreateTableMode::Overwrite) + .execute() + .await + .unwrap(); + // --8<-- [end:versioning_basic_setup] + assert_eq!(table.count_rows(None).await.unwrap(), 3); + + // --8<-- [start:versioning_check_initial_version] + let versions = table.list_versions().await.unwrap(); + let current_version = table.version().await.unwrap(); + println!("Number of versions after creation: {}", versions.len()); + println!("Current version: {}", current_version); + // --8<-- [end:versioning_check_initial_version] + assert_eq!(versions.len(), 1); + assert_eq!(current_version, versions.last().unwrap().version); + + // --8<-- [start:versioning_update_data] + table + .update() + .only_if("author = 'Richard'") + .column("author", "'Richard Daniel Sanchez'") + .execute() + .await + .unwrap(); + let rows_after_update = table + .count_rows(Some("author = 'Richard Daniel Sanchez'".to_string())) + .await + .unwrap(); + println!( + "Rows updated to Richard Daniel Sanchez: {}", + rows_after_update + ); + // --8<-- [end:versioning_update_data] + assert_eq!(rows_after_update, 2); + + // --8<-- [start:versioning_add_data] + let more_data = vec![ + (4, "Richard Daniel Sanchez", "That's the way the news goes!"), + (5, "Morty", "Aww geez, Rick!"), + ]; + table + .add(make_quotes_reader(more_data)) + .execute() + .await + .unwrap(); + // --8<-- [end:versioning_add_data] + assert_eq!(table.count_rows(None).await.unwrap(), 5); + + // --8<-- [start:versioning_check_versions_after_mod] + let versions_after_mod = table.list_versions().await.unwrap(); + let version_count_after_mod = versions_after_mod.len(); + let version_after_mod = table.version().await.unwrap(); + println!( + "Number of versions after modifications: {}", + version_count_after_mod + ); + println!("Current version: {}", version_after_mod); + // --8<-- [end:versioning_check_versions_after_mod] + assert!(version_count_after_mod >= 2); + assert_eq!(version_after_mod, versions_after_mod.last().unwrap().version); + + // --8<-- [start:versioning_list_all_versions] + let all_versions = table.list_versions().await.unwrap(); + for v in &all_versions { + println!("Version {}, created at {}", v.version, v.timestamp); + } + // --8<-- [end:versioning_list_all_versions] + assert!(!all_versions.is_empty()); + + // --8<-- [start:versioning_rollback] + table.checkout(version_after_mod).await.unwrap(); + table.restore().await.unwrap(); + let versions_after_rollback = table.list_versions().await.unwrap(); + let version_count_after_rollback = versions_after_rollback.len(); + println!( + 
"Total number of versions after rollback: {}", + version_count_after_rollback + ); + // --8<-- [end:versioning_rollback] + assert_eq!(version_count_after_rollback, version_count_after_mod + 1); + assert_eq!(table.count_rows(None).await.unwrap(), 5); + + // --8<-- [start:versioning_checkout_latest] + table.checkout_latest().await.unwrap(); + // --8<-- [end:versioning_checkout_latest] + let latest_version = table.version().await.unwrap(); + let versions_after_checkout = table.list_versions().await.unwrap(); + assert_eq!(latest_version, versions_after_checkout.last().unwrap().version); + + // --8<-- [start:versioning_delete_data] + table.delete("author = 'Morty'").await.unwrap(); + let rows_after_deletion = table.count_rows(None).await.unwrap(); + println!("Number of rows after deletion: {}", rows_after_deletion); + // --8<-- [end:versioning_delete_data] + assert_eq!(rows_after_deletion, 3); } diff --git a/tests/ts/multimodal.test.ts b/tests/ts/multimodal.test.ts new file mode 100644 index 0000000..ac9a820 --- /dev/null +++ b/tests/ts/multimodal.test.ts @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The LanceDB Authors +import { expect, test } from "@jest/globals"; +// --8<-- [start:multimodal_imports] +import * as arrow from "apache-arrow"; +import { Buffer } from "node:buffer"; +import * as lancedb from "@lancedb/lancedb"; +// --8<-- [end:multimodal_imports] +import { withTempDirectory } from "./util.ts"; + +test("multimodal snippets (async)", async () => { + await withTempDirectory(async (databaseDir) => { + const db = await lancedb.connect(databaseDir); + + // --8<-- [start:create_dummy_data] + const createDummyImage = (color: string): Uint8Array => { + const pngHeader = Uint8Array.from([137, 80, 78, 71, 13, 10, 26, 10]); + return Buffer.concat([Buffer.from(pngHeader), Buffer.from(color, "utf8")]); + }; + + const data = [ + { + id: 1, + filename: "red_square.png", + vector: Array.from({ length: 128 }, (_, i) => (i % 16) / 16), + image_blob: createDummyImage("red"), + label: "red", + }, + { + id: 2, + filename: "blue_square.png", + vector: Array.from({ length: 128 }, (_, i) => ((i + 8) % 16) / 16), + image_blob: createDummyImage("blue"), + label: "blue", + }, + ]; + // --8<-- [end:create_dummy_data] + + // --8<-- [start:define_schema] + const schema = new arrow.Schema([ + new arrow.Field("id", new arrow.Int32()), + new arrow.Field("filename", new arrow.Utf8()), + new arrow.Field( + "vector", + new arrow.FixedSizeList( + 128, + new arrow.Field("item", new arrow.Float32(), true), + ), + ), + new arrow.Field("image_blob", new arrow.Binary()), + new arrow.Field("label", new arrow.Utf8()), + ]); + // --8<-- [end:define_schema] + + // --8<-- [start:ingest_data] + const multimodalData = lancedb.makeArrowTable(data, { schema }); + const tbl = await db.createTable("images", multimodalData, { + mode: "overwrite", + }); + // --8<-- [end:ingest_data] + expect(await tbl.countRows()).toBe(2); + + // --8<-- [start:search_data] + const queryVector = Array.from({ length: 128 }, (_, i) => (i % 16) / 16); + const results = await tbl.search(queryVector).limit(1).toArray(); + // --8<-- [end:search_data] + + // --8<-- [start:process_results] + for (const row of results) { + const imageBytes = row.image_blob as Uint8Array; + console.log( + `Retrieved image: ${row.filename}, Byte length: ${imageBytes.length}`, + ); + } + // --8<-- [end:process_results] + expect(results).toHaveLength(1); + + // --8<-- [start:blob_api_schema] + const blobSchema = new arrow.Schema([ + 
new arrow.Field("id", new arrow.Int64()), + new arrow.Field( + "video", + new arrow.LargeBinary(), + true, + new Map([["lance-encoding:blob", "true"]]), + ), + ]); + // --8<-- [end:blob_api_schema] + + // --8<-- [start:blob_api_ingest] + const blobData = lancedb.makeArrowTable( + [ + { id: 1, video: Buffer.from("fake_video_bytes_1") }, + { id: 2, video: Buffer.from("fake_video_bytes_2") }, + ], + { schema: blobSchema }, + ); + const blobTable = await db.createTable("videos", blobData, { + mode: "overwrite", + }); + // --8<-- [end:blob_api_ingest] + expect(await blobTable.countRows()).toBe(2); + }); +}); diff --git a/tests/ts/tables.test.ts b/tests/ts/tables.test.ts index b565522..8b345d1 100644 --- a/tests/ts/tables.test.ts +++ b/tests/ts/tables.test.ts @@ -5,6 +5,23 @@ import * as arrow from "apache-arrow"; import * as lancedb from "@lancedb/lancedb"; import { withTempDirectory } from "./util.ts"; +async function updateConnectCloudExample() { + // --8<-- [start:update_connect_cloud] + const db = await lancedb.connect("db://your-project-slug", { + apiKey: "your-api-key", + region: "us-east-1", + }); + // --8<-- [end:update_connect_cloud] + return db; +} + +async function updateConnectLocalExample() { + // --8<-- [start:update_connect_local] + const db = await lancedb.connect("./data"); + // --8<-- [end:update_connect_local] + return db; +} + test("table creation snippets (async)", async () => { await withTempDirectory(async (databaseDir) => { const db = await lancedb.connect(databaseDir); @@ -397,3 +414,390 @@ test("schema evolution snippets (async)", async () => { expect(await vectorTable.countRows()).toBe(1); }); }); + +test("update snippets (async)", async () => { + // Keep connection snippets in this file, but do not run cloud/local examples in CI. 
+ void updateConnectCloudExample; + void updateConnectLocalExample; + + await withTempDirectory(async (databaseDir) => { + const db = await lancedb.connect(databaseDir); + + { + // --8<-- [start:update_example_table_setup] + const table = await db.createTable( + "users_example", + [ + { id: 1, name: "Alice", login_count: 10 }, + { id: 2, name: "Bob", login_count: 20 }, + ], + { mode: "overwrite" }, + ); + // --8<-- [end:update_example_table_setup] + await table.countRows(); + } + + { + // --8<-- [start:update_operation] + const table = await db.createTable( + "users_example", + [ + { id: 1, name: "Alice", login_count: 10 }, + { id: 2, name: "Bob", login_count: 20 }, + ], + { mode: "overwrite" }, + ); + await table.update({ where: "id = 2", values: { name: "Bobby" } }); + // --8<-- [end:update_operation] + await table.countRows(); + } + + { + // --8<-- [start:update_using_sql] + const table = await db.createTable( + "users_example", + [ + { id: 1, name: "Alice", login_count: 10 }, + { id: 2, name: "Bob", login_count: 20 }, + ], + { mode: "overwrite" }, + ); + await table.update({ + where: "id = 2", + valuesSql: { login_count: "login_count + 1" }, + }); + // --8<-- [end:update_using_sql] + await table.countRows(); + } + + { + // --8<-- [start:merge_matched_update_only] + const table = await db.createTable( + "users_example", + [ + { id: 1, name: "Alice", login_count: 10 }, + { id: 2, name: "Bob", login_count: 20 }, + ], + { mode: "overwrite" }, + ); + + const incomingUsers = [ + { id: 2, name: "Bobby", login_count: 21 }, + { id: 3, name: "Charlie", login_count: 5 }, + ]; + + await table + .mergeInsert("id") + .whenMatchedUpdateAll() + .execute(incomingUsers); + // --8<-- [end:merge_matched_update_only] + await table.countRows(); + } + + { + // --8<-- [start:insert_if_not_exists] + const table = await db.createTable( + "users_example", + [ + { id: 1, name: "Alice", login_count: 10 }, + { id: 2, name: "Bob", login_count: 20 }, + ], + { mode: "overwrite" }, + ); + + const incomingUsers = [ + { id: 2, name: "Bobby", login_count: 21 }, + { id: 3, name: "Charlie", login_count: 5 }, + ]; + + await table + .mergeInsert("id") + .whenNotMatchedInsertAll() + .execute(incomingUsers); + // --8<-- [end:insert_if_not_exists] + await table.countRows(); + } + + { + // --8<-- [start:merge_update_insert] + const table = await db.createTable( + "users_example", + [ + { id: 1, name: "Alice", login_count: 10 }, + { id: 2, name: "Bob", login_count: 20 }, + ], + { mode: "overwrite" }, + ); + + const incomingUsers = [ + { id: 2, name: "Bobby", login_count: 21 }, + { id: 3, name: "Charlie", login_count: 5 }, + ]; + + await table + .mergeInsert("id") + .whenMatchedUpdateAll() + .whenNotMatchedInsertAll() + .execute(incomingUsers); + // --8<-- [end:merge_update_insert] + await table.countRows(); + } + + { + // --8<-- [start:merge_delete_missing_by_source] + const table = await db.createTable( + "users_example", + [ + { id: 1, name: "Alice", login_count: 10 }, + { id: 2, name: "Bob", login_count: 20 }, + { id: 3, name: "Charlie", login_count: 5 }, + ], + { mode: "overwrite" }, + ); + + const incomingUsers = [ + { id: 2, name: "Bobby", login_count: 21 }, + { id: 3, name: "Charlie", login_count: 5 }, + ]; + + await table + .mergeInsert("id") + .whenMatchedUpdateAll() + .whenNotMatchedInsertAll() + .whenNotMatchedBySourceDelete() + .execute(incomingUsers); + // --8<-- [end:merge_delete_missing_by_source] + await table.countRows(); + } + + { + // --8<-- [start:merge_partial_columns] + const table = await db.createTable( + 
"users_example", + [ + { id: 1, name: "Alice", login_count: 10 }, + { id: 2, name: "Bob", login_count: 20 }, + ], + { mode: "overwrite" }, + ); + + const incomingUsers = [ + { id: 2, name: "Bobby" }, + { id: 3, name: "Charlie" }, + ]; + + await table + .mergeInsert("id") + .whenMatchedUpdateAll() + .whenNotMatchedInsertAll() + .execute(incomingUsers); + // --8<-- [end:merge_partial_columns] + await table.countRows(); + } + + { + const table = await db.createTable( + "users_example", + [ + { id: 1, name: "Alice", login_count: 10 }, + { id: 2, name: "Bob", login_count: 20 }, + { id: 3, name: "Charlie", login_count: 5 }, + ], + { mode: "overwrite" }, + ); + + // --8<-- [start:delete_operation] + // delete data + const predicate = "id = 3"; + await table.delete(predicate); + // --8<-- [end:delete_operation] + await table.countRows(); + } + + { + const table = await db.createTable( + "users_cleanup_example", + [ + { id: 1, name: "Alice", login_count: 10 }, + { id: 2, name: "Bob", login_count: 20 }, + { id: 3, name: "Charlie", login_count: 5 }, + ], + { mode: "overwrite" }, + ); + + // --8<-- [start:update_optimize_cleanup] + const olderThan = new Date(); + olderThan.setDate(olderThan.getDate() - 1); + await table.optimize({ cleanupOlderThan: olderThan }); + // --8<-- [end:update_optimize_cleanup] + } + }); +}); + +test("versioning snippets (async)", async () => { + await withTempDirectory(async (databaseDir) => { + const db = await lancedb.connect(databaseDir); + + // --8<-- [start:versioning_basic_setup] + const tableName = "quotes_versioning_example"; + const data = [ + { id: 1, author: "Richard", quote: "Wubba Lubba Dub Dub!" }, + { id: 2, author: "Morty", quote: "Rick, what's going on?" }, + { + id: 3, + author: "Richard", + quote: "I turned myself into a pickle, Morty!", + }, + ]; + const table = await db.createTable(tableName, data, { mode: "overwrite" }); + // --8<-- [end:versioning_basic_setup] + expect(await table.countRows()).toBe(3); + + // --8<-- [start:versioning_check_initial_version] + const versions = await table.listVersions(); + const currentVersion = await table.version(); + console.log(`Number of versions after creation: ${versions.length}`); + console.log(`Current version: ${currentVersion}`); + // --8<-- [end:versioning_check_initial_version] + expect(versions.length).toBe(1); + expect(currentVersion).toBe(versions[versions.length - 1].version); + + // --8<-- [start:versioning_update_data] + await table.update({ + where: "author = 'Richard'", + values: { author: "Richard Daniel Sanchez" }, + }); + const rowsAfterUpdate = await table.countRows( + "author = 'Richard Daniel Sanchez'", + ); + console.log(`Rows updated to Richard Daniel Sanchez: ${rowsAfterUpdate}`); + // --8<-- [end:versioning_update_data] + expect(rowsAfterUpdate).toBe(2); + + // --8<-- [start:versioning_add_data] + const moreData = [ + { + id: 4, + author: "Richard Daniel Sanchez", + quote: "That's the way the news goes!", + }, + { id: 5, author: "Morty", quote: "Aww geez, Rick!" 
},
+ ];
+ await table.add(moreData);
+ // --8<-- [end:versioning_add_data]
+ expect(await table.countRows()).toBe(5);
+
+ // --8<-- [start:versioning_check_versions_after_mod]
+ const versionsAfterMod = await table.listVersions();
+ const versionCountAfterMod = versionsAfterMod.length;
+ const versionAfterMod = await table.version();
+ console.log(
+ `Number of versions after modifications: ${versionCountAfterMod}`,
+ );
+ console.log(`Current version: ${versionAfterMod}`);
+ // --8<-- [end:versioning_check_versions_after_mod]
+ expect(versionCountAfterMod).toBeGreaterThanOrEqual(2);
+ expect(versionAfterMod).toBe(
+ versionsAfterMod[versionsAfterMod.length - 1].version,
+ );
+
+ // --8<-- [start:versioning_list_all_versions]
+ const allVersions = await table.listVersions();
+ for (const v of allVersions) {
+ console.log(`Version ${v.version}, created at ${v.timestamp}`);
+ }
+ // --8<-- [end:versioning_list_all_versions]
+ expect(allVersions.length).toBeGreaterThanOrEqual(1);
+
+ // --8<-- [start:versioning_rollback]
+ await table.checkout(versionAfterMod);
+ await table.restore();
+ const versionsAfterRollback = await table.listVersions();
+ const versionCountAfterRollback = versionsAfterRollback.length;
+ console.log(
+ `Total number of versions after rollback: ${versionCountAfterRollback}`,
+ );
+ // --8<-- [end:versioning_rollback]
+ expect(versionCountAfterRollback).toBe(versionCountAfterMod + 1);
+ expect(await table.countRows()).toBe(5);
+
+ // --8<-- [start:versioning_checkout_latest]
+ await table.checkoutLatest();
+ // --8<-- [end:versioning_checkout_latest]
+ const latestVersion = await table.version();
+ const versionsAfterCheckout = await table.listVersions();
+ expect(latestVersion).toBe(
+ versionsAfterCheckout[versionsAfterCheckout.length - 1].version,
+ );
+
+ // --8<-- [start:versioning_delete_data]
+ await table.delete("author = 'Morty'");
+ const rowsAfterDeletion = await table.countRows();
+ console.log(`Number of rows after deletion: ${rowsAfterDeletion}`);
+ // --8<-- [end:versioning_delete_data]
+ expect(rowsAfterDeletion).toBe(3);
+ });
+});
+
+test("consistency snippets (async)", async () => {
+ await withTempDirectory(async (databaseDir) => {
+ // --8<-- [start:consistency_strong]
+ const strongWriterDb = await lancedb.connect(databaseDir);
+ const strongReaderDb = await lancedb.connect(databaseDir, {
+ readConsistencyInterval: 0,
+ });
+ const strongWriterTable = await strongWriterDb.createTable(
+ "consistency_strong_table",
+ [{ id: 1 }],
+ { mode: "overwrite" },
+ );
+ const strongReaderTable = await strongReaderDb.openTable(
+ "consistency_strong_table",
+ );
+ await strongWriterTable.add([{ id: 2 }]);
+ const strongRowsAfterWrite = await strongReaderTable.countRows();
+ console.log(
+ `Rows visible with strong consistency: ${strongRowsAfterWrite}`,
+ );
+ // --8<-- [end:consistency_strong]
+ expect(strongRowsAfterWrite).toBe(2);
+
+ // --8<-- [start:consistency_eventual]
+ const eventualWriterDb = await lancedb.connect(databaseDir);
+ const eventualReaderDb = await lancedb.connect(databaseDir, {
+ readConsistencyInterval: 3600,
+ });
+ const eventualWriterTable = await eventualWriterDb.createTable(
+ "consistency_eventual_table",
+ [{ id: 1 }],
+ { mode: "overwrite" },
+ );
+ const eventualReaderTable = await eventualReaderDb.openTable(
+ "consistency_eventual_table",
+ );
+ await eventualWriterTable.add([{ id: 2 }]);
+ const eventualRowsAfterWrite = await eventualReaderTable.countRows();
+ console.log(
+ `Rows visible before eventual refresh interval: 
${eventualRowsAfterWrite}`, + ); + // --8<-- [end:consistency_eventual] + expect(eventualRowsAfterWrite).toBe(1); + + // --8<-- [start:consistency_checkout_latest] + const checkoutWriterDb = await lancedb.connect(databaseDir); + const checkoutReaderDb = await lancedb.connect(databaseDir); + const checkoutWriterTable = await checkoutWriterDb.createTable( + "consistency_checkout_latest_table", + [{ id: 1 }], + { mode: "overwrite" }, + ); + const checkoutReaderTable = await checkoutReaderDb.openTable( + "consistency_checkout_latest_table", + ); + await checkoutWriterTable.add([{ id: 2 }]); + const rowsBeforeRefresh = await checkoutReaderTable.countRows(); + console.log(`Rows before checkoutLatest: ${rowsBeforeRefresh}`); + await checkoutReaderTable.checkoutLatest(); + const rowsAfterRefresh = await checkoutReaderTable.countRows(); + console.log(`Rows after checkoutLatest: ${rowsAfterRefresh}`); + // --8<-- [end:consistency_checkout_latest] + expect(rowsBeforeRefresh).toBe(1); + expect(rowsAfterRefresh).toBe(2); + }); +});