diff --git a/astro.config.mjs b/astro.config.mjs index e2db76fa..7e4ce164 100644 --- a/astro.config.mjs +++ b/astro.config.mjs @@ -61,6 +61,25 @@ export default defineConfig({ { label: 'Run graph algorithms', link: '/get-started/graph-algorithms' }, ] }, + { + label: 'Import data', + collapsed: true, + items: [ + { label: 'Overview', link: '/import' }, + { label: 'Copy from CSV', link: '/import/csv' }, + { label: 'Copy from Parquet', link: '/import/parquet' }, + { label: 'Copy from NumPy', link: '/import/npy', badge: { text: 'Experimental', variant: 'danger'}}, + ] + }, + { + label: 'Export data', + collapsed: true, + items: [ + { label: 'Overview', link: '/export' }, + { label: 'Copy to CSV', link: '/export/csv' }, + { label: 'Copy to Parquet', link: '/export/parquet' }, + ] + }, { label: 'Visualize graphs', link: '/visualization', @@ -75,7 +94,7 @@ export default defineConfig({ { label: 'Create your first RDF graph', link: '/rdf-graphs/example-rdfgraph' }, { label: 'Query an RDF graph in Cypher', link: '/rdf-graphs/rdfgraphs-overview' }, { label: 'RDF bulk data import', link: '/rdf-graphs/rdf-import' }, - { label: 'Example RDFGraphs', link: '/rdf-graphs/rdfgraphs-repo' }, + { label: 'Preloaded RDFGraphs', link: '/rdf-graphs/rdfgraphs-repo' }, ], autogenerate: { directory: 'reference' }, }, diff --git a/src/content/docs/export/csv.md b/src/content/docs/export/csv.md new file mode 100644 index 00000000..5300fe94 --- /dev/null +++ b/src/content/docs/export/csv.md @@ -0,0 +1,48 @@ +--- +title: Export CSV +--- + +The `COPY TO` clause can export query results to a CSV file, and is used as follows: + +```cypher +COPY (MATCH (u:User) RETURN u.*) TO 'user.csv' (header=true); +``` + +The CSV file consists of the following fields: + +```csv +u.name,u.age +Adam,30 +Karissa,40 +Zhang,50 +Noura,25 +``` + +Nested data types like lists and structs will be represented as strings within their respective columns. + +Available options are: + +
+ +| Option | Default Value | Description | +|:------------------------:|:-----------------------:|---------------------------------------------------------------------------| +| `ESCAPE` | `\` | Character used to escape special characters in CSV | +| `DELIM` | `,` | Character that separates fields in the CSV | +| `QUOTE` | `"` | Character used to enclose fields containing special characters or spaces | +| `Header` | `false` | Indicates whether to output a header row | + +
+ +Another example is shown below. + +```cypher +COPY (MATCH (a:User)-[f:Follows]->(b:User) RETURN a.name, f.since, b.name) TO 'follows.csv' (header=false, delim='|'); +``` + +This outputs the following results to `follows.csv`: +```csv +Adam|2020|Karissa +Adam|2020|Zhang +Karissa|2021|Zhang +Zhang|2022|Noura +``` diff --git a/src/content/docs/export/index.mdx b/src/content/docs/export/index.mdx new file mode 100644 index 00000000..078cb163 --- /dev/null +++ b/src/content/docs/export/index.mdx @@ -0,0 +1,23 @@ +--- +title: Overview +--- + +import { LinkCard } from '@astrojs/starlight/components'; + +The `COPY TO` command allows you to export query results directly to the specified file format. This +is useful when you want to persist the results of a query to be used in other systems, or for +archiving purposes. + +## `COPY TO` CSV + + + +## `COPY TO` Parquet + + diff --git a/src/content/docs/export/parquet.md b/src/content/docs/export/parquet.md new file mode 100644 index 00000000..92c2ee6e --- /dev/null +++ b/src/content/docs/export/parquet.md @@ -0,0 +1,33 @@ +--- +title: Export Parquet +--- + +The `COPY TO` clause can export query results to a Parquet file. It can be combined with a subquery +and used as shown below. + +```cypher +COPY (MATCH (u:User) return u.*) TO 'user.parquet'; +``` + +The `LOAD FROM` clause can used to scan the Parquet file and to verify that the export worked: + +```cypher +> LOAD FROM 'user.parquet' RETURN *; +------------------- +| u.name | u.age | +------------------- +| Adam | 30 | +------------------- +| Karissa | 40 | +------------------- +| Zhang | 50 | +------------------- +| Noura | 25 | +------------------- +``` + +:::caution[Notes] +- Exporting [fixed list](../cypher/data-types#list) or [variant](../../cypher/data-types/variant) data types to Parquet are not yet supported. 
+- [UNION](../../cypher/data-types/union) is exported as a [STRUCT](../../cypher/data-types/struct), which is the internal representation of the `Union` data type.
+- Currently, only Snappy compression is supported for exports.
+:::
diff --git a/src/content/docs/import/csv.md b/src/content/docs/import/csv.md
new file mode 100644
index 00000000..a104e725
--- /dev/null
+++ b/src/content/docs/import/csv.md
@@ -0,0 +1,116 @@
+---
+title: Import data from CSV files
+---
+
+You can bulk import data to node and relationship tables from CSV files
+using the `COPY FROM` command. It is **highly recommended** to use `COPY FROM` if you are creating large
+databases.
+
+The CSV import configuration can be manually set by specifying the parameters inside `( )` at the
+end of the `COPY FROM` clause. The following table shows the configuration parameters supported:
+
+| Parameter | Description | Default Value |
+|:-----|:-----|:-----|
+| `HEADER` | Whether the first line of the CSV file is the header. Can be true or false. | false |
+| `DELIM` | Character that separates different columns in a line. | `,`|
+| `QUOTE` | Character to start a string quote. | `"` |
+| `ESCAPE` | Character within string quotes to escape QUOTE and other characters, e.g., a line break.
See the important note below about line breaks.| `\` |
+| `LIST_BEGIN`/`LIST_END` | For the [list data type](../cypher/data-types/list.md), the delimiters to specify
list begin and list end characters | `[`, `]`|
+| `PARALLEL` | Read CSV files in parallel or not | true |
+
+The example below specifies that the CSV delimiter is `|` and also that the header row exists.
+
+```cypher
+COPY User FROM "user.csv" (HEADER=true, DELIM="|");
+```
+
+:::caution[Guidelines]
+- **Start with empty tables:** `COPY FROM` commands can be used when your tables are completely empty. So you should use `COPY FROM` immediately after you define the schemas of your tables.
+- **Copy nodes before relationships:** In order to copy a relationship table `R` from a CSV file `RFile`, the nodes that appear in `RFile` need to
+already exist in the database (either imported in bulk or inserted through Cypher data manipulation commands).
+- **Wrap strings inside quotes:** Kùzu will accept strings in string columns both with and without quotes, though it's recommended to wrap strings in quotes to avoid any ambiguity with delimiters.
+- **Avoid leading and trailing spaces**: As per the CSV standard, Kùzu does not ignore leading and trailing spaces (e.g., if you input ` 213 ` for
+ an integer value, that will be read as a malformed integer and the corresponding node/rel property will be set to NULL).
+:::
+
+## Import to node table
+
+Create a node table `User` as follows:
+
+```cypher
+CREATE NODE TABLE User(name STRING, age INT64, reg_date DATE, PRIMARY KEY (name))
+```
+
+The CSV file `user.csv` contains the following fields:
+```csv
+name,age,reg_date
+Adam,30,2020-06-22
+Karissa,40,2019-05-12
+...
+```
+
+The following statement will load `user.csv` into the User table.
+
+```cypher
+COPY User FROM "user.csv" (header=true);
+```
+
+## Import to relationship table
+
+When loading into a relationship table, Kùzu assumes the first two columns in the file are:
+
+- `FROM` Node Column: The primary key of the `FROM` nodes.
+- `TO` Node Column: The primary key of the `TO` nodes.
+
+The rest of the columns correspond to relationship properties.
+
+Create a relationship table `Follows` using the following Cypher query:
+
+```cypher
+CREATE REL TABLE Follows(FROM User TO User, since DATE)
+```
+
+This reads data from the below CSV file `follows.csv`:
+```csv
+Adam,Karissa,2010-01-30
+Karissa,Michelle,2014-01-30
+...
+```
+
+The following statement loads the `follows.csv` file into a `Follows` table.
+
+```cypher
+COPY Follows FROM "follows.csv";
+```
+
+Note that the header wasn't present in the CSV file, hence the `header` parameter is not set.
+
+## Import multiple files to a single table
+
+It is common practice to divide a large CSV file into several smaller files for cleaner data management.
+Kùzu can read multiple files with the same structure, consolidating their data into a single node or relationship table.
+You can specify that multiple files are loaded in the following ways:
+
+### Glob pattern
+
+This is similar to the Unix [glob](https://man7.org/linux/man-pages/man7/glob.7.html) pattern, where you specify
+file paths that match a given pattern. The following wildcard characters are supported:
+
+| Wildcard | Description |
+| :-----------: | ----------- |
+| `*` | match any number of any characters (including none) |
+| `?` | match any single character |
+| `[abc]` | match any one of the characters enclosed within the brackets |
+| `[a-z]` | match any one of the characters within the range |
+
+```cypher
+COPY User FROM "User*.csv"
+```
+
+### List of files
+
+Alternatively, you can just specify a list of files to be loaded.
+
+```cypher
+COPY User FROM ["User0.csv", "User1.csv", "User2.csv"]
+```
diff --git a/src/content/docs/import/index.mdx b/src/content/docs/import/index.mdx
new file mode 100644
index 00000000..90576366
--- /dev/null
+++ b/src/content/docs/import/index.mdx
@@ -0,0 +1,46 @@
+---
+title: Overview
+---
+
+import { LinkCard } from '@astrojs/starlight/components';
+
+There are multiple ways to import data in Kùzu.
The only prerequisite for inserting data +into a database is that you first create a graph schema, i.e., the structure of your node and relationship tables. + +For small graphs (a few thousand nodes), the `CREATE` and `MERGE` [Cypher clauses](../cypher/data-manipulation-clauses) +can be used to insert nodes and +relationships. These are similar to SQL's `INSERT` statements, but bear in mind that they are slower than the bulk import +options shown below. The `CREATE`/`MERGE` clauses are intended to do small additions or updates on a sporadic basis. + +In general, the recommended approach is to use `COPY FROM` (rather than creating or +merging nodes one by one), for larger graphs of millions of nodes and beyond. For now, the `COPY FROM` +commands can only be used when tables are empty. + +## `COPY FROM` CSV + +The `COPY FROM` command is used to bulk import data from a CSV file into a node or relationship table. +See the linked card below for more information and examples. + + + +## `COPY FROM` Parquet + +Similar to CSV, the `COPY FROM` command is used to bulk import data from a Parquet file into a node or relationship table. +See the linked card below for more information and examples. + + + +## `COPY FROM` NumPy + +Importing from NumPy is a specific use case that allows you to import numeric data from a NumPy file into a node table. + + \ No newline at end of file diff --git a/src/content/docs/import/npy.md b/src/content/docs/import/npy.md new file mode 100644 index 00000000..7c0c1338 --- /dev/null +++ b/src/content/docs/import/npy.md @@ -0,0 +1,44 @@ +--- +title: Import NumPy +--- + +The `.npy` format is the standard binary file format in [NumPy](https://numpy.org/) for persisting a +single arbitrary NumPy array on disk. + +The primary use case for bulk loading NumPy files is to load +large node features or vectors that are stored in `.npy` format. You can use the `COPY FROM` statement +to import a set of `*.npy` files into a node table. 
+
+:::caution[Notes]
+This is an experimental feature and will evolve. Currently, this feature has the following constraints:
+- **Import to node table only**: For now, Kùzu supports loading `.npy` files into **node tables** only.
+- **Start with empty tables**: `COPY FROM` commands can be used when your tables are completely empty.
+So you should use `COPY FROM` immediately after you define the schemas of your tables.
+- **NPY file mapped to column**: Each `.npy` file will be loaded as a node table column. So, in the `COPY FROM` statement, the
+number of `.npy` files must be equal to the number of columns defined in DDL.
+- **Numerical types only**: A `.npy` file can only contain numerical values.
+:::
+
+## Import to node table
+Consider a `Paper` table with an `id` column, a feature column that is an embedding (vector) with 768 dimensions,
+a `year` column and a `label` column as ground truth. We first define the schema with the following statement:
+
+```cypher
+CREATE NODE TABLE Paper(id INT64, feat FLOAT[768], year INT64, label DOUBLE, PRIMARY KEY(id));
+```
+
+The raw data is stored in `.npy` format where each column is represented as a NumPy array on disk. The files are
+specified below:
+
+```
+"node_id.npy", "node_feat_f32.npy", "node_year.npy", "node_label.npy"
+```
+
+We can copy the files with the following statement:
+
+```cypher
+COPY Paper FROM ("node_id.npy", "node_feat_f32.npy", "node_year.npy", "node_label.npy") BY COLUMN;
+```
+
+As stated before, the number of `*.npy` files must equal the number of columns, and must also be
+specified in the same order as they are defined in the DDL.
diff --git a/src/content/docs/import/parquet.md b/src/content/docs/import/parquet.md
new file mode 100644
index 00000000..dd362a01
--- /dev/null
+++ b/src/content/docs/import/parquet.md
@@ -0,0 +1,93 @@
+---
+title: Import Parquet
+---
+
+[Apache Parquet](https://parquet.apache.org/docs/) is an open source, column-oriented persistent storage format
+designed for efficient data storage and retrieval. Kùzu supports bulk data import from Parquet files
+using the `COPY FROM` command.
+
+:::caution[Notes]
+Parquet files store schema information in their metadata, so you don't need to explicitly handle columns
+based on type, unlike in CSV. However, the same rules apply:
+
+- **Start with empty tables:** `COPY FROM` commands can be used when your tables are completely empty.
+So you should use `COPY FROM` immediately after you define the schemas of your tables.
+- **Copy nodes before relationships:** In order to copy a relationship table `R` from a Parquet file `RFile`,
+the nodes that appear in `RFile` need to already exist in the database (either imported in bulk or
+inserted through Cypher data manipulation commands).
+:::
+
+## Import to node table
+
+Similar to CSV import, the order of columns in a Parquet file needs to match the order of predefined
+properties for node tables in the catalog, i.e. the order used when defining the schema of a node table.
+
+The following example is for a file named `user.parquet`. The output is obtained by using `print(pyarrow.Table)`.
+```py
+pyarrow.Table
+name: string
+age: int64
+----
+name: [["Adam","Karissa","Zhang","Noura"]]
+age: [[30,40,50,25]]
+```
+
+To load this Parquet file into a `User` table, simply run:
+
+```cypher
+COPY User FROM "user.parquet";
+```
+
+## Import to relationship table
+
+Similar to CSV import, the first two columns for a relationship file should be the `from` and the `to` columns
+that represent existing nodes' primary keys.
+
+The following example is for a file named `follows.parquet`.
The output is obtained by using `print(pyarrow.Table)`.
+
+```py
+pyarrow.Table
+from: string
+to: string
+since: int64
+----
+from: [["Adam","Adam","Karissa","Zhang"]]
+to: [["Karissa","Zhang","Zhang","Noura"]]
+since: [[2020,2020,2021,2022]]
+```
+
+To load this Parquet file into a `Follows` table, simply run:
+
+```cypher
+COPY Follows FROM "follows.parquet";
+```
+
+## Import multiple files to a single table
+
+It is common practice to divide large Parquet files into several smaller files for cleaner data management.
+Kùzu can read multiple files with the same structure, consolidating their data into a single node or relationship table.
+You can specify that multiple files are loaded in the following ways:
+
+### Glob pattern
+
+This is similar to the Unix [glob](https://man7.org/linux/man-pages/man7/glob.7.html) pattern, where you specify
+file paths that match a given pattern. The following wildcard characters are supported:
+
+| Wildcard | Description |
+| :-----------: | ----------- |
+| `*` | match any number of any characters (including none) |
+| `?` | match any single character |
+| `[abc]` | match any one of the characters enclosed within the brackets |
+| `[a-z]` | match any one of the characters within the range |
+
+```cypher
+COPY User FROM "User*.parquet"
+```
+
+### List of files
+
+Alternatively, you can just specify a list of files to be loaded.
+
+```cypher
+COPY User FROM ["User0.parquet", "User1.parquet", "User2.parquet"]
+```