Merge pull request #153 from ldbc/dev
Release 0.3.1
szarnyasg authored Nov 24, 2019
2 parents 3766473 + bc9fcc6 commit e10de9a
Showing 362 changed files with 6,817 additions and 11,589 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -6,7 +6,7 @@
*.toc
*.fdb_latexmk
*.fls
*.synctex.gz*
*.synctex*
*.bbl
*.blg
ldbc-snb-specification.pdf
@@ -16,3 +16,4 @@ Thumbs.db
.idea/
*.~*
*.*~
/*.pdf
3 changes: 1 addition & 2 deletions .travis.yml
@@ -8,8 +8,7 @@ env:
- REPOSITORY_SNAPSHOT=github.com/ldbc/ldbc_snb_docs_snapshot.git
- secure: H/QaDPPacT6wJsGNyDbD934OjGVBVx7XhR/lz+fe/8709sWLy/r7Bzc5sJIW7RezQTaCRd/6pSLwvwvnRBWN0Cd83Pxn5aLwfLq1v8m0fElMEdG0jQHXpYXfUyhcemTH0pvc4BA0RBET00xid08TSuXek5Olys700s5SMJ89St7hbrRiW4cMdS5yWI6tzVjC+ndv+JjeHABcqXf+IIQqdv+FZ/845x7mQqaLs6u70pQaIWEf5ktT1Qqj7WEuGpSOg3M+Ipn870T9msK6QUsOlj1+p3AcVY79F2Ybt+X7fghRLZGH/F970nYa0t1tGR3T99WQFxUxJ22GS+jTfyx5o2eVBRL9zYUsLmTy01Y3/IO9vC16yNBh8KviWfNobvCKmpqGdOalJ7UvyIJ6Tj4fQ/ytgonDGCzY5MiwPPvyJRwAKwtWTIo3EcV8DoE7lc1uwHTqsOX7CtyyMm4s4RSdXMfcLbYST4TgS+hKnOmOBywTtOKRR2a4Odgc9ZQeKA1rHcdv8rJQJ6uJANymJxUJZEj4FyZmOz1NNJMQENVBTBhPdc1rKoyOQ8KpSHdAXOCfe1KmVmcgFQYMTcEfnfWGpOQgWOtBjP2SwSyt8YE10wHLkHcpNQSagsGqs4z6NGNMW8wBzg4fkZlUMyYxA/rVm2i9HpzhVnvZYfgKtHGcTwk=
script:
- docker build . --tag ldbc/docs
- docker run -v `pwd`/:/mnt/ ldbc/docs /bin/bash -c "cd /mnt/ && ./generate-tex.py && make generate_query_cards texfot compile_query_cards"
- docker run -v `pwd`/:/mnt/ ldbc/docs /bin/bash -c "cd /mnt/ && ./generate-tex.py && make generate_query_cards texfot compile_query_cards workloads"
- mkdir out
- if [ "$TRAVIS_BRANCH" = "master" -a "$TRAVIS_PULL_REQUEST" = "false" ]; then export GH_REF=$REPOSITORY_STABLE; deployment/deploy.sh; fi
- if [ "$TRAVIS_BRANCH" = "dev" -a "$TRAVIS_PULL_REQUEST" = "false" ]; then export GH_REF=$REPOSITORY_SNAPSHOT; deployment/deploy.sh; fi
10 changes: 9 additions & 1 deletion Makefile
@@ -1,8 +1,11 @@
DOCUMENT=ldbc-snb-specification.tex

all: $(DOCUMENT)
spec: $(DOCUMENT)
latexmk -pdf --interaction=batchmode $(DOCUMENT)

all: generate_query_cards compile_query_cards workloads texfot
ls *.pdf

generate_query_cards: $(DOCUMENT)
./generate-tex.py

@@ -12,6 +15,11 @@ compile_query_cards: $(DOCUMENT)
../texfot.pl latexmk -pdf --interaction=batchmode $$card ; \
done

workloads: $(DOCUMENT)
for doc in workload-*.tex; do \
./texfot.pl latexmk -pdf --interaction=batchmode $$doc ; \
done

texfot: $(DOCUMENT)
./texfot.pl latexmk -pdf --interaction=batchmode $(DOCUMENT)
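With these targets in place, the individual artefacts can be built selectively. A minimal usage sketch (target names taken from the Makefile above; running plain `make` invokes the default `all` target):

```bash
# Build only the main specification PDF
make spec

# Regenerate the query card .tex files, compile them, and build the
# per-workload PDFs (workload-*.tex)
make generate_query_cards compile_query_cards workloads
```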

29 changes: 16 additions & 13 deletions README.md
@@ -15,11 +15,13 @@ The documentation of the [LDBC Graphalytics benchmark](https://graphalytics.org)

## How to cite LDBC benchmarks

* **LDBC Semantic Publishing Benchmark:** [Benchmarking RDF Query Engines: The LDBC Semantic Publishing Benchmark](http://ceur-ws.org/Vol-1700/paper-01.pdf), BLINK at ISWC 2016 by V. Kotsev et al. [[bib](bib/spb.bib)]
* **LDBC Graphalytics:** [LDBC Graphalytics: A Benchmark for Large-Scale Graph Analysis on Parallel and Distributed Platforms](http://www.vldb.org/pvldb/vol9/p1317-iosup.pdf) VLDB 2016 by A. Iosup et al. [[bib](bib/graphalytics.bib)]
* **LDBC Social Network Benchmark – Interactive workload:** [The LDBC Social Network Benchmark: Interactive Workload](https://homepages.cwi.nl/~boncz/snb-challenge/snb-sigmod.pdf), SIGMOD 2015 by O. Erling et al. [[bib](bib/snb-interactive.bib)]
* **LDBC Social Network Benchmark – BI workload:** [An early look at the LDBC Social Network Benchmark's Business Intelligence workload](http://ldbcouncil.org/sites/default/files/ldbc-bi-grades.pdf), GRADES-NDA at SIGMOD 2018 by G. Szárnyas et al. [[bib](bib/snb-bi.bib)]
* **The full specification:** [The LDBC Social Network Benchmark (version 0.4.0-SNAPSHOT)](https://ldbc.github.io/ldbc_snb_docs/ldbc-snb-specification.pdf) by LDBC Social Network Benchmark task force, 2019. [[bib](bib/specification.bib)]
* **Social Network Benchmark**
* **Detailed specification:** [The LDBC Social Network Benchmark (version 0.3.1)](https://ldbc.github.io/ldbc_snb_docs/ldbc-snb-specification.pdf) by the LDBC Social Network Benchmark task force, 2019. [[bib](bib/specification.bib)]
* **BI workload:** [An early look at the LDBC Social Network Benchmark's Business Intelligence workload](http://ldbcouncil.org/sites/default/files/ldbc-bi-grades.pdf), GRADES-NDA at SIGMOD 2018 by G. Szárnyas et al. [[bib](bib/snb-bi.bib)]
* **Interactive workload:** [The LDBC Social Network Benchmark: Interactive Workload](https://homepages.cwi.nl/~boncz/snb-challenge/snb-sigmod.pdf), SIGMOD 2015 by O. Erling et al. [[bib](bib/snb-interactive.bib)]
* **Other LDBC benchmarks**
* **LDBC Graphalytics:** [LDBC Graphalytics: A Benchmark for Large-Scale Graph Analysis on Parallel and Distributed Platforms](http://www.vldb.org/pvldb/vol9/p1317-iosup.pdf), VLDB 2016 by A. Iosup et al. [[bib](bib/graphalytics.bib)]
* **LDBC Semantic Publishing Benchmark:** [Benchmarking RDF Query Engines: The LDBC Semantic Publishing Benchmark](http://ceur-ws.org/Vol-1700/paper-01.pdf), BLINK at ISWC 2016 by V. Kotsev et al. [[bib](bib/spb.bib)]

## How to build this document

@@ -34,6 +36,7 @@ To get consistent formatting, query cards are generated from query specification
```bash
sudo apt-get install -y pandoc
sudo apt-get install -y python3 python3-pip python3-setuptools
sudo pip3 install -r requirements.txt
```

1. To generate the TeX files for query cards, run the following command:
@@ -46,20 +49,20 @@

To build the document, run `make` or `make texfot`. The latter requires Perl but gives you a cleaner output.
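For quick reference, a sketch of both invocations (assuming the dependencies above are installed):

```bash
make           # default target
make texfot    # builds the specification with the LaTeX log filtered through texfot.pl (requires Perl)
```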

We also provide a Dockerfile for building the document. To create the Docker image, run the following command:
We also provide [an image on Docker Hub](https://hub.docker.com/r/ldbc/docs) for building the document. To use it, run:

```console
docker build . --tag ldbc/docs
```

Once the image is created, you can compile the document by issuing:

```console
```bash
docker run -v `pwd`/:/mnt/ ldbc/docs /bin/bash -c \
"cd /mnt/ && ./generate-tex.py && make generate_query_cards texfot compile_query_cards"; \
sudo chown -R $USER:$USER .
```

You can also build the image manually by issuing:

```bash
docker build . --tag ldbc/docs
```

### Notations and conventions

#### Naming conventions
179 changes: 93 additions & 86 deletions benchmark-specification.tex
@@ -28,9 +28,9 @@ \section{Requirements}
\section{Software and Useful Links}

\begin{itemize}
\item \textbf{LDBC Driver 0.3 -- \url{https://github.com/ldbc/ldbc_driver}}: The driver
\item \textbf{LDBC Driver -- \url{https://github.com/ldbc/ldbc_driver}}: The driver
responsible for executing the \ldbcsnb workload.
\item \textbf{\datagen 0.2.5 -- \url{https://github.com/ldbc/ldbc_snb_datagen}}: The data
\item \textbf{\datagen -- \url{https://github.com/ldbc/ldbc_snb_datagen}}: The data
generator used to generate the datasets of the benchmark.
\end{itemize}

@@ -46,7 +46,7 @@ \section{Data}
factors.

\subsection{Data Types}
\autoref{table:types} describes the different types used in the whole benchmark.
\autoref{table:types} describes the different data types used in the benchmark.

\begin{table}[h]
\centering
@@ -114,7 +114,7 @@ \subsection{Data Schema}

{\flushleft \textbf{Textual Restrictions and Notes}}
\begin{itemize}
\item Posts have content or imageFile. They have one of them but not both. The one they do not have is an empty string.
\item Posts have content or imageFile. They have one of them but not both. The one they do not have is an empty string.\footnote{Implementations can use other means to represent this value, such as \texttt{NULL}.}
\item Posts in a forum can be created by a non-member person if and only if that person is a moderator.
\end{itemize}

@@ -298,9 +298,7 @@ \subsubsection{Entities}
\label{table:tag}
\end{table}

{\flushleft \textbf{TagClass:}} a class or a category used to build
a hierarchy of tags. \autoref{table:tagclass} shows the attributes of TagClass
entity.
{\flushleft \textbf{TagClass:}} a class used to build a hierarchy of tags. \autoref{table:tagclass} shows the attributes of the TagClass entity.

\begin{table}[H]
\begin{tabular}{|>{\varNameCell}p{\attributeColumnWidth}|>{\typeCell}p{\typeColumnWidth}|p{\descriptionColumnWidth}|}
@@ -603,10 +601,9 @@ \subsubsection{Implementation Details}
correlation dimension. Finally, the last reducer generates the remaining
information such as forums, posts and comments.


\subsection{Output Data}

\datagen produces outputs three different items:
For each scale factor, \datagen produces three different artefacts:
\begin{itemize}
\item \textbf{Dataset}: The dataset to be bulk loaded by the SUT. It
corresponds to roughly 90\% of the total generated network.
@@ -619,105 +616,115 @@ \subsection{Output Data}
read queries of the workloads.
\end{itemize}

The SUT has to take care only of the generated Dataset to be bulk loaded.
The formats currently supported by \datagen are the following:

\begin{itemize}
\item \textbf{CSV variants:}
\begin{itemize}
\item \textbf{CsvBasic:} Data output in CSV format, one file per different entity and on file
per different relation. Also, there is a file for those attributes whose
cardinality is larger than one, \ie \texttt{Person.email}, \texttt{Person.speaks}.
\item \textbf{CsvMergeForeign:} Similar to the CSV format, but the
relations of the form 1-to-1 and 1-to-N are stored in the entity files as
a foreign keys.
\item \textbf{CsvComposite:} Similar to the CsvBasic format, but uses composite attributes for storing the \texttt{Person.email}, and \texttt{Person.speaks} attributes.
\item \textbf{CsvCompositeMergeForeign:} Has the traits of both the CsvComposite and the CsvMergeForeign formats.
\end{itemize}
\item \textbf{Turtle:} Dataset in Turtle format for RDF systems.
\end{itemize}
\subsubsection{Scale Factors}

\ldbcsnb defines a set of scale factors (SFs), targeting systems of different sizes and budgets.
SFs are computed based on the ASCII size in Gigabytes of the generated output files using the CsvBasic serializer.
For example, SF 1 takes up roughly 1~GB in CSV format, SF 3 roughly 3~GB, and so on.
The proposed SFs are the following: 1, 3, 10, 30, 100, 300, 1000.
Additionally, two small SFs, 0.1 and 0.3, are provided to help initial validation efforts.

The Test Sponsor may select the SF that best fits their needs by properly configuring the \datagen, as described in \autoref{section:data_generation}.
The size of the resulting dataset is mainly affected by the following configuration parameters: the number of persons and the number of years simulated.
By default, all SFs are defined over a period of three years, starting from 2010, and SFs are computed by scaling the number of Persons in the network.
\autoref{tab:snsize} shows some metrics of SFs 0.1 \ldots 1000.

\subsubsection{CsvBasic}
\begin{table}[H]
\centering
\begin{tabular}{|l||r|r|r|r|r|r|r|r|r|}
\hline
Scale Factor & 0.1 & 0.3 & 1 & 3 & 10 & 30 & 100 & 300 & 1000 \\ \hline\hline
\# of Persons & 1.5K & 3.5K & 11K & 27K & 73K & 182K & 499K & 1.25M & 3.6M \\ \hline\hline
%Number of nodes & 327588 & 908224 & 3181724 & 9281922 & 29987835 & 88789833 & 282637871 & 817316105 & 2686781095 \\ \hline
%Number of edges & 1477965 & 4583118 & 17256038 & 52695735 & 176623445 & 540915404 & 1775513811 & 5269286554 & 17788734714 \\ \hline
\# of nodes & 327.6K & 908K & 3.2M & 9.3M & 30M & 88.8M & 282.6M & 817.3M & 2.7B \\ \hline
\# of edges & 1.5M & 4.6M & 17.3M & 52.7M & 176.6M & 540.9M & 1.8B & 5.3B & 17.8B \\ \hline
\end{tabular}
\centering
\caption{Parameters of each scale factor.}
\label{tab:snsize}
\end{table}
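As a purely illustrative sketch of how a scale factor is selected: the datagen reads its configuration from a parameter file; the exact parameter name and value format below are an assumption based on the ldbc_snb_datagen repository and may differ between versions.

```bash
# Hypothetical datagen configuration entry selecting SF 1
# (parameter name and value format are assumptions, not normative):
cat params.ini
# ldbc.snb.datagen.generator.scaleFactor:snb.interactive.1
```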

This is a comma separated format. Each entity, relation and properties with a
cardinality larger than one, are output in a separate file. Generated files are
summarized at \autoref{table:csv_basic}. Depending on the number of threads used
for generating the dataset, the number of files varies, since there is a file
generated per thread. The * in the file names indicates a number between 0 and
$\mathsf{NumberOfThreads}-1$.
%In \autoref{appendix:scale_factors}, we show the statistics of each of the
%proposed SFs in detail, including distributions for some of the relations.

\input{table-csv-basic}


\subsubsection{CsvMergeForeign}

This is a comma separated format. It is similar to CSV, but those relations
connecting two entities A and B, where an entity A has a cardinality of one, A
is output as a column of entity B. Generated files are summarized at
\autoref{table:csv_merge_foreign}. Depending on the number of threads used for generating
the dataset, the number of files varies, since there is a file generated per
thread. The * in the file names indicates a number between 0 and $\mathsf{NumberOfThreads}-1$.

\input{table-csv-merge-foreign}

\subsubsection{CsvComposite}
\input{table-csv-composite}

TODO -- \autoref{table:csv_composite}.
\input{table-csv-composite-merge-foreign}

\input{table-csv-composite}
\subsubsection{Dataset}

\subsubsection{CsvCompositeMergeForeign}
The datasets are generated in the \texttt{social\_network/} directory, split into static and dynamic parts (\autoref{figure:schema}).
Only the generated Dataset has to be loaded (bulk loaded) by the SUT. Using \texttt{NULL} values for storing optional values is allowed.

TODO -- \autoref{table:csv_composite_merge_foreign}.
The formats currently supported by \datagen serializers are the following:

\input{table-csv-composite-merge-foreign}
\begin{itemize}
\item \textbf{CSV variants:}
These serializers produce CSV-like text files which use the pipe character ``\texttt{|}'' as the primary field separator and the semicolon character ``\texttt{;}'' as a separator for multi-valued attributes (where applicable).
The CSV files are stored in two subdirectories: \texttt{static/} and \texttt{dynamic/}.
Depending on the number of threads used for generating the dataset, the number of files varies, since there is a file generated per thread. The per-thread part of the file names is indicated with ``\texttt{*}'' in the specification (an illustrative snippet follows this list).
Currently, the following CSV variants are supported:
\begin{itemize}
\item \textbf{CsvBasic:}
Each entity, each relation, and each attribute with a cardinality larger than one (including the attributes \texttt{Person.email} and \texttt{Person.speaks}) is output in a separate file.
Generated files are summarized in \autoref{table:csv_basic}.

\item \textbf{CsvMergeForeign:}
This serializer is similar to CsvBasic, but relations with a cardinality of 1-to-1 or 1-to-N are merged into the entity files as foreign keys.
Generated files are summarized in \autoref{table:csv_merge_foreign}.

\item \textbf{CsvComposite:}
Similar to the CsvBasic format, but each entity, and each relation with a cardinality larger than one, is output in a separate file.
Multi-valued attributes (\texttt{Person.email} and \texttt{Person.speaks}) are stored as composite values.
Generated files are summarized in \autoref{table:csv_composite}.

\item \textbf{CsvCompositeMergeForeign:}
Has the traits of both the CsvComposite and the CsvMergeForeign formats.
Multi-valued attributes (\texttt{Person.email} and \texttt{Person.speaks}) are stored as composite values.
Generated files are summarized in \autoref{table:csv_composite_merge_foreign}.
\end{itemize}
\item \textbf{Turtle:} Dataset in Turtle format for RDF systems.
\end{itemize}
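To make the separators concrete, a purely hypothetical record is sketched below; the actual file names, column order, and values are defined by the serializer tables referenced above and by the generator configuration.

```bash
# Peek at a generated Person file (file name illustrative only):
head -n 1 social_network/dynamic/person_0_0.csv
# Hypothetical output: '|' separates fields; in the composite variants, ';'
# separates the values of a multi-valued attribute such as Person.email:
#   1234|Jane|Doe|female|1987-05-27|...|jane@example.org;jane.doe@example.com
```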

\paragraph{Turtle}

\subsubsection{Turtle}
This serializer uses the standard Turtle format\footnote{Description of
the Turtle RDF format: \url{http://www.w3.org/TR/turtle/}}. It outputs
two files: \texttt{0\_ldbc\_socialnet\_static\_dbp.ttl} and \texttt{0\_ldbc\_socialnet.ttl}, containing the static and the dynamic part of the graph, respectively.

This is the standard Turtle\footnote{Description of
the Turtle RDF format http://www.w3.org/TR/turtle/} format. \datagen outputs
two files: \texttt{0\_ldbc\_socialnet\_static\_dbp.ttl} and \texttt{0\_ldbc\_socialnet.ttl}.
\subsubsection{Update Streams}

\subsection{Scale Factors}
\input{table-update-streams}

\ldbcsnb defines a set of scale factors (SFs), targeting systems of different
sizes and budgets. SFs are computed based on the ASCII size in Gigabytes of
the generated output files using the CSV serializer. For example, SF 1 weighs roughly 1~GB in CSV
format, SF 3 weights roughly 3~GB and so on and so forth. The proposed SFs are
the following: 1, 3, 10, 30, 100, 300, 1000.
Additionally, two small SFs, 0.1 and 0.3 are provided to help initial validation efforts.
The generic schema is given in \autoref{table:update_stream_generic_schema}, while the concrete schema of each insert operation is given in \autoref{table:update_stream_schemas}.
The update stream files are generated in the \texttt{social\_network/} directory and are grouped as follows:

The Test Sponsor may select the SF
that better fits their needs, by properly configuring the \datagen,
as described in \autoref{section:data_generation}.
The size of the resulting dataset, is mainly affected by the following
configuration parameters: the number of persons and the number of years
simulated. Different SFs are computed by scaling the number of Persons in
the network, while fixing the number of years simulated.
\autoref{tab:snsize} shows the parameters used in each of the SFs.
\begin{itemize}
\item \texttt{updateStream\_*\_person.csv} contains operation 1: \queryRefCard{interactive-insert-01}{II}{1}
\item \texttt{updateStream\_*\_forum.csv} contains operations 2--8: %
\queryRefCard{interactive-insert-02}{II}{2}
\queryRefCard{interactive-insert-03}{II}{3}
\queryRefCard{interactive-insert-04}{II}{4}
\queryRefCard{interactive-insert-05}{II}{5}
\queryRefCard{interactive-insert-06}{II}{6}
\queryRefCard{interactive-insert-07}{II}{7}
\queryRefCard{interactive-insert-08}{II}{8}
\end{itemize}
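For illustration, a hypothetical listing for a run with two generator threads might look as follows (the exact number and naming of the files depend on the datagen configuration):

```bash
ls social_network/updateStream_*
# updateStream_0_person.csv   updateStream_0_forum.csv
# updateStream_1_person.csv   updateStream_1_forum.csv
```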

\begin{table}[H]
\centering
\begin{tabular}{|c||r|r|r|r|r|r|r|r|r|}
\hline
Scale Factor & 0.1 & 0.3 & 1 & 3 & 10 & 30 & 100 & 300 & 1000 \\ \hline
\# of Persons & 1.5K & 3.5K & 11K & 27K & 73K & 182K & 499K & 1.25M & 3.6M \\ \hline
\# of Years & 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 & 3 \\ \hline
Start Year & 2010 & 2010 & 2010 & 2010 & 2010 & 2010 & 2010 & 2010 & 2010 \\ \hline
\end{tabular}
\centering
\caption{Parameters of each scale factor.}
\label{tab:snsize}
\end{table}
Remark: update streams in version 0.3.0 only contain inserts. Delete operations are being designed and will be released in 2020.

For example, SF 30 consists of the activity of a social network of 182K users
during a period of three years, starting from 2010.
\subsubsection{Substitution Parameters}

%In \autoref{appendix:scale_factors}, we show the statistics of each of the
%proposed SFs in detail, including distributions for some of the relations.
The substitution parameters are generated in the \texttt{substitution\_parameters/} directory.
Each parameter file is named \texttt{\{interactive|bi\}\_<id>\_param.txt} and corresponds to one operation of the
Interactive complex reads (\queryRefCard{interactive-complex-read-01}{IC}{1}--\queryRefCard{interactive-complex-read-14}{IC}{14}) or the
BI reads (\queryRefCard{bi-read-01}{BI}{1}--\queryRefCard{bi-read-25}{BI}{25}).
The schemas of these files are defined by the operation, e.g. the schema of \queryRefCard{interactive-complex-read-01}{IC}{1} is ``\texttt{personId|firstName}''.
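For example, the parameter file of Interactive complex read 1 starts with the header given above; a hypothetical peek (the data row is made up):

```bash
head -n 2 substitution_parameters/interactive_1_param.txt
# personId|firstName
# 10995116284808|John
```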

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Expand All @@ -728,7 +735,7 @@ \section{Benchmark Workflow}

The workflow of the benchmark is shown in \autoref{figure:workflow}.

\begin{figure}[htbp]
\begin{figure}[htb]
\centering
\includegraphics[width=\linewidth]{figures/workflow}
\caption{Benchmark workflow.}