Report Memory Consumption of the DF Graph #832

Merged (6 commits, May 30, 2024)
Changes from all commits
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
*.heapsnapshot
doc/
*-flowr.log.*
statistics-out*/
9 changes: 9 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions package.json
@@ -228,6 +228,7 @@
"n-readlines": "^1.0.1",
"n3": "^1.17.2",
"object-hash": "^3.0.0",
"object-sizeof": "^2.6.4",
"rotating-file-stream": "^3.1.1",
"semver": "^7.5.4",
"tar": "^7.1.0",
20 changes: 16 additions & 4 deletions src/benchmark/slicer.ts
@@ -3,7 +3,6 @@
* @module
*/


import type { IStoppableStopwatch } from './stopwatch'
import { Measurements } from './stopwatch'
import fs from 'fs'
@@ -16,6 +15,7 @@ import { PipelineExecutor } from '../core/pipeline-executor'
import { guard } from '../util/assert'
import { withoutWhitespace } from '../util/strings'
import type {
BenchmarkMemoryMeasurement,
CommonSlicerMeasurements,
ElapsedTime,
PerSliceMeasurements,
@@ -33,6 +33,7 @@ import type { SlicingCriteriaFilter } from '../slicing/criterion/collect-all'
import { collectAllSlicingCriteria } from '../slicing/criterion/collect-all'
import { RType } from '../r-bridge/lang-4.x/ast/model/type'
import { visitAst } from '../r-bridge/lang-4.x/ast/model/processing/visitor'
import { getSizeOfDfGraph } from './stats/size-of'

export const benchmarkLogger = log.getSubLogger({ name: 'benchmark' })

@@ -64,6 +65,7 @@ export interface BenchmarkSingleSliceStats extends MergeableRecord {
code: ReconstructionResult
}


/**
* A slicer that can be used to slice exactly one file (multiple times).
* It holds its own {@link RShell} instance, maintains a cached dataflow and keeps measurements.
@@ -75,8 +77,9 @@ export interface BenchmarkSingleSliceStats extends MergeableRecord {
*/
export class BenchmarkSlicer {
/** Measures all data recorded *once* per slicer (complete setup up to the dataflow graph creation) */
private readonly commonMeasurements = new Measurements<CommonSlicerMeasurements>()
private readonly commonMeasurements = new Measurements<CommonSlicerMeasurements>()
private readonly perSliceMeasurements = new Map<SlicingCriteria, PerSliceStats>()
private readonly deltas = new Map<CommonSlicerMeasurements, BenchmarkMemoryMeasurement>()
private readonly shell: RShell
private stats: SlicerStats | undefined
private loadedXml: string | undefined
@@ -164,6 +167,7 @@ export class BenchmarkSlicer {
this.stats = {
commonMeasurements: new Map<CommonSlicerMeasurements, ElapsedTime>(),
perSliceMeasurements: this.perSliceMeasurements,
memory: this.deltas,
request,
input: {
numberOfLines: split.length,
@@ -181,7 +185,8 @@
numberOfNodes: [...this.dataflow.graph.vertices(true)].length,
numberOfEdges: numberOfEdges,
numberOfCalls: numberOfCalls,
numberOfFunctionDefinitions: numberOfDefinitions
numberOfFunctionDefinitions: numberOfDefinitions,
sizeOfObject: getSizeOfDfGraph(this.dataflow.graph)
}
}
}
@@ -251,10 +256,17 @@
expectedStep: Step,
keyToMeasure: CommonSlicerMeasurements
): Promise<PipelineStepOutputWithName<typeof DEFAULT_SLICING_PIPELINE, Step>> {
const memoryInit = process.memoryUsage()
const { result } = await this.commonMeasurements.measureAsync(
keyToMeasure, () => this.pipeline.nextStep(expectedStep)
)

const memoryEnd = process.memoryUsage()
this.deltas.set(keyToMeasure, {
heap: memoryEnd.heapUsed - memoryInit.heapUsed,
rss: memoryEnd.rss - memoryInit.rss,
external: memoryEnd.external - memoryInit.external,
buffs: memoryEnd.arrayBuffers - memoryInit.arrayBuffers
})
return result as PipelineStepOutputWithName<typeof DEFAULT_SLICING_PIPELINE, Step>
}

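In essence, the new bookkeeping samples process.memoryUsage() immediately before and after each pipeline step and stores the field-wise difference. A minimal standalone sketch of the same idea (the helper name and its usage are illustrative, not part of this diff):

import type { BenchmarkMemoryMeasurement } from './stats/stats'

/* run an arbitrary async step and report how Node's memory counters moved;
 * a GC cycle during the step can make any of the deltas negative */
async function withMemoryDelta<T>(step: () => Promise<T>): Promise<[T, BenchmarkMemoryMeasurement]> {
    const before = process.memoryUsage()
    const result = await step()
    const after = process.memoryUsage()
    return [result, {
        heap:     after.heapUsed     - before.heapUsed,
        rss:      after.rss          - before.rss,
        external: after.external     - before.external,
        buffs:    after.arrayBuffers - before.arrayBuffers
    }]
}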
19 changes: 17 additions & 2 deletions src/benchmark/stats/print.ts
@@ -80,6 +80,18 @@ function printCountSummarizedMeasurements(stats: SummarizedMeasurement): string
return `${range} (median: ${stats.median}, mean: ${stats.mean}, std: ${stats.std})`
}

const units = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']

// based on https://stackoverflow.com/a/39906526
function convertNumberToNiceBytes(x: number){
let n = Math.abs(x)
let l = 0
while(n >= 1024 && ++l){
n = n/1024
}
return pad((x < 0 ? '-' : '') + n.toFixed(n < 10 && l > 0 ? 1 : 0) + ' ' + units[l])
}
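For intuition: the helper divides by 1024 until the value drops below one unit, keeps one decimal only for small scaled values, and re-attaches the sign afterwards. Expected outputs (ignoring the left-padding applied by pad):

convertNumberToNiceBytes(512)                 // '512 bytes'
convertNumberToNiceBytes(1536)                // '1.5 KiB'
convertNumberToNiceBytes(-10 * 1024 * 1024)   // '-10 MiB'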

/**
* Converts the given stats to a human-readable string.
* You may have to {@link summarizeSlicerStats | summarize} the stats first.
@@ -133,7 +145,8 @@
Dataflow:
Number of nodes: ${pad(stats.dataflow.numberOfNodes)}
Number of edges: ${pad(stats.dataflow.numberOfEdges)}
Number of calls: ${pad(stats.dataflow.numberOfCalls)}
Number of function defs: ${pad(stats.dataflow.numberOfFunctionDefinitions)}`
Number of function defs: ${pad(stats.dataflow.numberOfFunctionDefinitions)}
Size of graph: ${convertNumberToNiceBytes(stats.dataflow.sizeOfObject)}`
}

export function ultimateStats2String(stats: UltimateSlicerStats): string {
@@ -173,7 +186,9 @@
Dataflow:
Number of nodes: ${formatSummarizedMeasure(stats.dataflow.numberOfNodes)}
Number of edges: ${formatSummarizedMeasure(stats.dataflow.numberOfEdges)}
Number of calls: ${formatSummarizedMeasure(stats.dataflow.numberOfCalls)}
Number of function defs: ${formatSummarizedMeasure(stats.dataflow.numberOfFunctionDefinitions)}`
Number of function defs: ${formatSummarizedMeasure(stats.dataflow.numberOfFunctionDefinitions)}
Size of graph: ${formatSummarizedMeasure(stats.dataflow.sizeOfObject, convertNumberToNiceBytes)}
`
}

function reduction2String(title: string, reduction: Reduction<SummarizedMeasurement>) {
72 changes: 72 additions & 0 deletions src/benchmark/stats/size-of.ts
@@ -0,0 +1,72 @@
import type { IEnvironment } from '../../dataflow/environments/environment'
import { BuiltInEnvironment } from '../../dataflow/environments/environment'
import type { DataflowGraph } from '../../dataflow/graph/graph'
import type { DataflowGraphVertexInfo } from '../../dataflow/graph/vertex'
import { VertexType } from '../../dataflow/graph/vertex'
import type { Identifier, IdentifierDefinition } from '../../dataflow/environments/identifier'
import sizeof from 'object-sizeof'

/* we have to kill all processors linked in the default environment as they cannot be serialized and they are shared anyway */
function killBuiltInEnv(env: IEnvironment | undefined): IEnvironment {
if(env === undefined) {
return undefined as unknown as IEnvironment
} else if(env.id === BuiltInEnvironment.id) {
/* in this case, the reference would be shared for sure */
return {
id: env.id,
parent: killBuiltInEnv(env.parent),
memory: new Map<Identifier, IdentifierDefinition[]>()
}
}

const memory = new Map<Identifier, IdentifierDefinition[]>()
for(const [k, v] of env.memory) {
memory.set(k, v.filter(v => !v.kind.startsWith('built-in') && !('processor' in v)))
}

return {
id: env.id,
parent: killBuiltInEnv(env.parent),
memory
}
}

/** Returns the size of the given df graph in bytes, counting shared references as if they were separate objects */
export function getSizeOfDfGraph(df: DataflowGraph): number {
const verts = []
for(const [, v] of df.vertices(true)) {
let vertex: DataflowGraphVertexInfo = v
if(vertex.environment) {
vertex = {
...vertex,
environment: {
...vertex.environment,
current: killBuiltInEnv(v.environment.current)
}
} as DataflowGraphVertexInfo
}

if(vertex.tag === VertexType.FunctionDefinition) {
vertex = {
...vertex,
subflow: {
...vertex.subflow,
environment: {
...vertex.subflow.environment,
current: killBuiltInEnv(vertex.subflow.environment.current)
}
}
} as DataflowGraphVertexInfo
}

vertex = {
...vertex,
/* shared anyway by using constants */
tag: 0 as unknown
} as DataflowGraphVertexInfo

verts.push(vertex)
}

return sizeof([...verts, ...df.edges()])
}
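Note that object-sizeof does not ask the engine for real heap usage; it walks the object graph and sums fixed per-type byte estimates, which is why the shared built-in environment has to be stripped first (otherwise the entire global scope would be counted once per vertex). A simplified sketch of the kind of accounting the library performs (illustrative only, not its actual implementation):

/* primitives get fixed byte costs, containers are traversed recursively */
function approximateSize(value: unknown): number {
    if(typeof value === 'number')  { return 8 }
    if(typeof value === 'boolean') { return 4 }
    if(typeof value === 'string')  { return value.length * 2 }
    if(value === null || value === undefined) { return 0 }
    if(Array.isArray(value)) {
        return value.reduce((sum: number, v) => sum + approximateSize(v), 0)
    }
    if(value instanceof Map) {
        let sum = 0
        for(const [k, v] of value) { sum += approximateSize(k) + approximateSize(v) }
        return sum
    }
    /* plain object: keys count as strings, values recursively */
    return Object.entries(value as object)
        .reduce((sum, [k, v]) => sum + k.length * 2 + approximateSize(v), 0)
}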
20 changes: 20 additions & 0 deletions src/benchmark/stats/stats.ts
@@ -2,6 +2,7 @@ import type { SingleSlicingCriterion, SlicingCriteria } from '../../slicing/crit
import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'
import type { ReconstructionResult } from '../../reconstruct/reconstruct'
import type { RParseRequestFromFile, RParseRequestFromText } from '../../r-bridge/retriever'
import type { MergeableRecord } from '../../util/objects'

export const CommonSlicerMeasurements = ['initialize R session', 'retrieve AST from R code', 'normalize R AST', 'produce dataflow information', 'close R session', 'total'] as const
export type CommonSlicerMeasurements = typeof CommonSlicerMeasurements[number]
@@ -38,6 +39,24 @@ export interface SlicerStatsDataflow<T = number> {
numberOfEdges: T
numberOfCalls: T
numberOfFunctionDefinitions: T
/* size of the graph object in bytes, as approximated by object-sizeof */
sizeOfObject: T
}


/**
* Please note that these measurements can be negative, as there is no guarantee that memory usage
* will increase (it may shrink due to, e.g., garbage collection).
*/
export interface BenchmarkMemoryMeasurement<T = number> extends MergeableRecord {
/* used heap memory delta as reported by the node process in bytes */
heap: T
/* resident set size delta as reported by the node process in bytes */
rss: T
/* external memory delta as reported by the node process in bytes */
external: T
/* (array) buffer memory delta as reported by the node process in bytes */
buffs: T
}
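For concreteness, a single (hypothetical) measurement could look like this; the negative buffer delta simply means more buffer memory was released than allocated while the step ran:

const example: BenchmarkMemoryMeasurement = {
    heap:     12_582_912, /* the step left ~12 MiB more in use on the heap */
    rss:      20_971_520, /* ~20 MiB growth of the resident set */
    external: 0,
    buffs:    -4_096      /* a collection freed buffer memory mid-step */
}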

/**
@@ -46,6 +65,7 @@ export interface SlicerStatsDataflow<T = number> {
export interface SlicerStats {
commonMeasurements: Map<CommonSlicerMeasurements, ElapsedTime>
perSliceMeasurements: Map<SlicingCriteria, PerSliceStats>
memory: Map<CommonSlicerMeasurements, BenchmarkMemoryMeasurement>,
request: RParseRequestFromFile | RParseRequestFromText
input: SlicerStatsInput
dataflow: SlicerStatsDataflow
15 changes: 11 additions & 4 deletions src/benchmark/summarizer/first-phase/input.ts
@@ -6,7 +6,7 @@ import { guard } from '../../../util/assert'
import { escape } from '../../../util/ansi'
import { jsonReplacer } from '../../../util/json'
import { readLineByLineSync } from '../../../util/files'
import type { CommonSlicerMeasurements, PerSliceMeasurements, PerSliceStats, SlicerStats } from '../../stats/stats'
import type { BenchmarkMemoryMeasurement, CommonSlicerMeasurements, PerSliceMeasurements, PerSliceStats, SlicerStats } from '../../stats/stats'
import type { SlicingCriteria } from '../../../slicing/criterion/parse'
import { stats2string } from '../../stats/print'

@@ -27,9 +27,13 @@ export async function processRunMeasurement(line: Buffer, fileNum: number, lineN
'file-id': got['file-id'],
'run-num': got['run-num'],
stats: {
input: got.stats.input,
request: got.stats.request,
dataflow: got.stats.dataflow,
input: got.stats.input,
request: got.stats.request,
dataflow: got.stats.dataflow,
memory: new Map(
(got.stats.memory as unknown as [CommonSlicerMeasurements, BenchmarkMemoryMeasurement][])
.map(([k, v]) => [k, v])
),
commonMeasurements: new Map(
(got.stats.commonMeasurements as unknown as [CommonSlicerMeasurements, string][])
.map(([k, v]) => {
@@ -49,6 +53,9 @@
let atSliceNumber = 0
const summarized = await summarizeSlicerStats(got.stats, (criterion, stats) => {
console.log(`${escape}1F${escape}1G${escape}2K [${++atSliceNumber}/${totalSlices}] Summarizing ${JSON.stringify(criterion)} (reconstructed has ${stats.reconstructedCode.code.length} characters)`)
if(stats.reconstructedCode.code.length < 50) {
console.log(`Reconstructed code: ${stats.reconstructedCode.code}`)
}
})

console.log(` - Append raw summary to ${outputPath}`)
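Rebuilding the Maps is necessary because they are serialized (via jsonReplacer) as arrays of [key, value] pairs. A minimal round-trip sketch of that convention (hypothetical data, using a plain [...map] spread in place of jsonReplacer):

const memory = new Map([['total', { heap: 1024, rss: 2048, external: 0, buffs: 0 }]])
const line = JSON.stringify([...memory])
// '[["total",{"heap":1024,"rss":2048,"external":0,"buffs":0}]]'
const revived = new Map(JSON.parse(line) as [string, object][])
// Map(1) { 'total' => { heap: 1024, ... } }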
1 change: 1 addition & 0 deletions src/benchmark/summarizer/first-phase/process.ts
@@ -222,6 +222,7 @@
}

export function summarizeSummarizedMeasurement(data: SummarizedMeasurement[]): SummarizedMeasurement {
data = data.filter(isNotUndefined)
const min = data.map(d => d.min).filter(isNotUndefined).reduce((a, b) => Math.min(a, b), Infinity)
const max = data.map(d => d.max).filter(isNotUndefined).reduce((a, b) => Math.max(a, b), -Infinity)
// calculate median of medians (don't just average the median!)
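The fields combine with different fidelity: minima and maxima of the per-file summaries combine exactly, whereas the median of medians is only an approximation of the true overall median (hence the comment in the code above). A small worked example with hypothetical summaries:

const a = { min: 1, max:  9, median: 4 }
const b = { min: 2, max: 20, median: 10 }
// exact:       combined min = Math.min(1, 2) = 1, combined max = Math.max(9, 20) = 20
// approximate: combined median = median([4, 10]) = 7; the true median of the raw data may differ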
30 changes: 20 additions & 10 deletions src/benchmark/summarizer/second-phase/process.ts
@@ -5,6 +5,7 @@ import type { SummarizedMeasurement } from '../../../util/summarizer'
import { summarizeMeasurement } from '../../../util/summarizer'
import { guard } from '../../../util/assert'
import type {
BenchmarkMemoryMeasurement,
SlicerStatsDataflow,
SlicerStatsInput
} from '../../stats/stats'
@@ -16,6 +17,7 @@ import {
export function summarizeAllSummarizedStats(stats: SummarizedSlicerStats[]): UltimateSlicerStats {
const commonMeasurements = new DefaultMap<CommonSlicerMeasurements, number[]>(() => [])
const perSliceMeasurements = new DefaultMap<PerSliceMeasurements, SummarizedMeasurement[]>(() => [])
const memory = new DefaultMap<CommonSlicerMeasurements, BenchmarkMemoryMeasurement[]>(() => [])
const reductions: Reduction<SummarizedMeasurement>[] = []
const reductionsNoFluff: Reduction<SummarizedMeasurement>[] = []
const inputs: SlicerStatsInput[] = []
@@ -31,6 +33,9 @@
for(const [k, v] of stat.perSliceMeasurements.measurements) {
perSliceMeasurements.get(k).push(v)
}
for(const [k, v] of stat.memory) {
memory.get(k).push(v)
}
reductions.push(stat.perSliceMeasurements.reduction)
reductionsNoFluff.push(stat.perSliceMeasurements.reductionNoFluff)
inputs.push(stat.input)
@@ -69,19 +74,19 @@
numberOfNodes: summarizeMeasurement(dataflows.map(d => d.numberOfNodes)),
numberOfFunctionDefinitions: summarizeMeasurement(dataflows.map(d => d.numberOfFunctionDefinitions)),
numberOfCalls: summarizeMeasurement(dataflows.map(d => d.numberOfCalls)),
numberOfEdges: summarizeMeasurement(dataflows.map(d => d.numberOfEdges))
numberOfEdges: summarizeMeasurement(dataflows.map(d => d.numberOfEdges)),
sizeOfObject: summarizeMeasurement(dataflows.map(d => d.sizeOfObject))
}
}
}
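The collection phase above leans on DefaultMap: looking up a missing key creates and stores a fresh default, so the per-measurement arrays need no existence checks before push. A sketch of the idea (hypothetical re-implementation; the repo's util may differ in detail):

class DefaultMap<K, V> extends Map<K, V> {
    constructor(private readonly makeDefault: () => V) { super() }
    get(key: K): V {
        if(!this.has(key)) { this.set(key, this.makeDefault()) }
        return super.get(key) as V
    }
}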

export function summarizeAllUltimateStats(stats: UltimateSlicerStats[]): UltimateSlicerStats {
return {
// these should be deterministic, so we don't technically need to use max, but we do just in case something unexpected happens :)
totalRequests: Math.max(...stats.map(s => s.totalRequests)),
totalSlices: Math.max(...stats.map(s => s.totalSlices)),
failedToRepParse: Math.max(...stats.map(s => s.failedToRepParse)),
timesHitThreshold: Math.max(...stats.map(s => s.timesHitThreshold)),

totalRequests: Math.max(...stats.map(s => s.totalRequests)),
totalSlices: Math.max(...stats.map(s => s.totalSlices)),
failedToRepParse: Math.max(...stats.map(s => s.failedToRepParse)),
timesHitThreshold: Math.max(...stats.map(s => s.timesHitThreshold)),
// average out / summarize other measurements
commonMeasurements: new Map(CommonSlicerMeasurements.map(m => [m, summarizeSummarizedMeasurement(stats.map(s => s.commonMeasurements.get(m) as SummarizedMeasurement))])),
perSliceMeasurements: new Map(PerSliceMeasurements.map(m => [m, summarizeSummarizedMeasurement(stats.map(s => s.perSliceMeasurements.get(m) as SummarizedMeasurement))])),
Expand All @@ -103,7 +108,8 @@ export function summarizeAllUltimateStats(stats: UltimateSlicerStats[]): Ultimat
numberOfNodes: summarizeSummarizedMeasurement(stats.map(s => s.dataflow.numberOfNodes)),
numberOfFunctionDefinitions: summarizeSummarizedMeasurement(stats.map(s => s.dataflow.numberOfFunctionDefinitions)),
numberOfCalls: summarizeSummarizedMeasurement(stats.map(s => s.dataflow.numberOfCalls)),
numberOfEdges: summarizeSummarizedMeasurement(stats.map(s => s.dataflow.numberOfEdges))
numberOfEdges: summarizeSummarizedMeasurement(stats.map(s => s.dataflow.numberOfEdges)),
sizeOfObject: summarizeSummarizedMeasurement(stats.map(s => s.dataflow.sizeOfObject))
}
}
}
@@ -112,9 +118,13 @@ export function processNextSummary(line: Buffer, allSummarized: SummarizedSlicer
let got = JSON.parse(line.toString()) as { summarize: SummarizedSlicerStats }
got = {
summarize: {
input: got.summarize.input,
request: got.summarize.request,
dataflow: got.summarize.dataflow,
input: got.summarize.input,
request: got.summarize.request,
dataflow: got.summarize.dataflow,
memory: new Map(
(got.summarize.memory as unknown as [CommonSlicerMeasurements, BenchmarkMemoryMeasurement][])
.map(([k, v]) => [k, v])
),
commonMeasurements: new Map(
(got.summarize.commonMeasurements as unknown as [CommonSlicerMeasurements, string][])
.map(([k, v]) => {
Expand Down
Loading
Loading