Skip to content

Commit

Permalink
apacheGH-39257: [JS] LargeBinary (apache#39258)
Browse files (browse the repository at this point in the history)
Merge after apache#39249
* Closes: apache#39257
  • Loading branch information
domoritz authored Dec 18, 2023
1 parent 9c097d5 commit 4ec6544
Show file tree
Hide file tree
Showing 32 changed files with 191 additions and 68 deletions.
2 changes: 1 addition & 1 deletion docs/source/status.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ Data Types
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Binary |||||||||
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Large Binary |||| | ||| |
| Large Binary |||| | ||| |
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
| Utf8 |||||||||
+-------------------+-------+-------+-------+------------+-------+-------+-------+-------+
Expand Down
4 changes: 2 additions & 2 deletions js/src/Arrow.dom.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ export {
Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64,
Float, Float16, Float32, Float64,
Utf8, LargeUtf8,
Binary,
Binary, LargeBinary,
FixedSizeBinary,
Date_, DateDay, DateMillisecond,
Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond,
Expand Down Expand Up @@ -78,7 +78,7 @@ export {
} from './Arrow.js';

export {
BinaryBuilder,
BinaryBuilder, LargeBinaryBuilder,
BoolBuilder,
DateBuilder, DateDayBuilder, DateMillisecondBuilder,
DecimalBuilder,
Expand Down
3 changes: 2 additions & 1 deletion js/src/Arrow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ export {
Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64,
Float, Float16, Float32, Float64,
Utf8, LargeUtf8,
Binary,
Binary, LargeBinary,
FixedSizeBinary,
Date_, DateDay, DateMillisecond,
Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond,
Expand Down Expand Up @@ -80,6 +80,7 @@ export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, Dur
export { Utf8Builder } from './builder/utf8.js';
export { LargeUtf8Builder } from './builder/largeutf8.js';
export { BinaryBuilder } from './builder/binary.js';
export { LargeBinaryBuilder } from './builder/largebinary.js';
export { ListBuilder } from './builder/list.js';
export { FixedSizeListBuilder } from './builder/fixedsizelist.js';
export { MapBuilder } from './builder/map.js';
Expand Down
6 changes: 3 additions & 3 deletions js/src/builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import {
DataType, strideForType,
Float, Int, Decimal, FixedSizeBinary,
Date_, Time, Timestamp, Interval, Duration,
Utf8, LargeUtf8, Binary, List, Map_,
Utf8, LargeUtf8, Binary, LargeBinary, List, Map_,
} from './type.js';
import { createIsValidFunction } from './builder/valid.js';
import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer.js';
Expand Down Expand Up @@ -285,7 +285,7 @@ export abstract class Builder<T extends DataType = any, TNull = any> {

if (typeIds = _typeIds?.flush(length)) { // Unions, DenseUnions
valueOffsets = _offsets?.flush(length);
} else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8, LargeUtf8), and Lists
} else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, LargeBinary, Utf8, LargeUtf8), and Lists
data = _values?.flush(_offsets.last());
} else { // Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, Duration and Interval)
data = _values?.flush(length);
Expand Down Expand Up @@ -352,7 +352,7 @@ export abstract class FixedWidthBuilder<T extends Int | Float | FixedSizeBinary
}

/** @ignore */
export abstract class VariableWidthBuilder<T extends Binary | Utf8 | LargeUtf8 | List | Map_, TNull = any> extends Builder<T, TNull> {
export abstract class VariableWidthBuilder<T extends Binary | LargeBinary | Utf8 | LargeUtf8 | List | Map_, TNull = any> extends Builder<T, TNull> {
protected _pendingLength = 0;
protected _offsets: OffsetsBufferBuilder<T>;
protected _pending: Map<number, any> | undefined;
Expand Down
54 changes: 54 additions & 0 deletions js/src/builder/largebinary.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

import { LargeBinary } from '../type.js';
import { toUint8Array } from '../util/buffer.js';
import { BufferBuilder } from './buffer.js';
import { VariableWidthBuilder, BuilderOptions } from '../builder.js';

/**
 * Builder for `LargeBinary` columns. Values are staged by the
 * `VariableWidthBuilder` base class and copied into a single byte buffer
 * when the pending values are flushed.
 * @ignore
 */
export class LargeBinaryBuilder<TNull = any> extends VariableWidthBuilder<LargeBinary, TNull> {
    constructor(opts: BuilderOptions<LargeBinary, TNull>) {
        super(opts);
        // The values buffer starts out empty and is grown via `reserve()` on flush.
        this._values = new BufferBuilder(new Uint8Array(0));
    }

    /**
     * Size estimate in bytes: the pending (un-flushed) byte count, plus
     * `length * 4`, plus the byte lengths of whichever of the offsets, values
     * and null-bitmap builders currently exist.
     */
    public get byteLength(): number {
        // NOTE(review): the `* 4` term matches the 32-bit-offset builders; LargeBinary
        // offsets are 64-bit, so confirm whether 8 bytes per value was intended here.
        let total = this._pendingLength + (this.length * 4);
        if (this._offsets) {
            total += this._offsets.byteLength;
        }
        if (this._values) {
            total += this._values.byteLength;
        }
        if (this._nulls) {
            total += this._nulls.byteLength;
        }
        return total;
    }

    /**
     * Normalizes `value` with `toUint8Array` and hands it to the base-class
     * `setValue` for the given `index`.
     */
    public setValue(index: number, value: Uint8Array) {
        const bytes = toUint8Array(value);
        return super.setValue(index, bytes);
    }

    /**
     * Writes every pending value into the values buffer and records its length
     * in the offsets builder.
     *
     * @param pending Map from row index to its bytes; `undefined` entries record a zero length.
     * @param pendingLength Number of bytes to reserve in the values buffer before copying.
     */
    protected _flushPending(pending: Map<number, Uint8Array | undefined>, pendingLength: number) {
        const offsetsBuilder = this._offsets;
        const target = this._values.reserve(pendingLength).buffer;
        // Write cursor into `target`; it starts at 0 and advances by each copied value's length.
        let cursor = 0;
        for (const [row, bytes] of pending) {
            if (bytes === undefined) {
                // No bytes to copy for this row: record a zero length only.
                offsetsBuilder.set(row, BigInt(0));
                continue;
            }
            const size = bytes.length;
            target.set(bytes, cursor);
            // Lengths are passed as BigInt (presumably because the offsets are 64-bit — confirm).
            offsetsBuilder.set(row, BigInt(size));
            cursor += size;
        }
    }
}
22 changes: 4 additions & 18 deletions js/src/builder/largeutf8.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import { LargeUtf8 } from '../type.js';
import { encodeUtf8 } from '../util/utf8.js';
import { BufferBuilder } from './buffer.js';
import { VariableWidthBuilder, BuilderOptions } from '../builder.js';
import { LargeBinaryBuilder } from './largebinary.js';

/** @ignore */
export class LargeUtf8Builder<TNull = any> extends VariableWidthBuilder<LargeUtf8, TNull> {
Expand All @@ -36,24 +37,9 @@ export class LargeUtf8Builder<TNull = any> extends VariableWidthBuilder<LargeUtf
public setValue(index: number, value: string) {
return super.setValue(index, encodeUtf8(value) as any);
}

// @ts-ignore
// TODO: move to largeBinaryBuilder when implemented
// protected _flushPending(pending: Map<number, Uint8Array | undefined>, pendingLength: number): void { }
protected _flushPending(pending: Map<number, Uint8Array | undefined>, pendingLength: number) {
const offsets = this._offsets;
const data = this._values.reserve(pendingLength).buffer;
let offset = 0;
for (const [index, value] of pending) {
if (value === undefined) {
offsets.set(index, BigInt(0));
} else {
const length = value.length;
data.set(value, offset);
offsets.set(index, BigInt(length));
offset += length;
}
}
}
protected _flushPending(pending: Map<number, Uint8Array | undefined>, pendingLength: number): void { }
}

// (LargeUtf8Builder.prototype as any)._flushPending = (LargeBinaryBuilder.prototype as any)._flushPending;
(LargeUtf8Builder.prototype as any)._flushPending = (LargeBinaryBuilder.prototype as any)._flushPending;
15 changes: 13 additions & 2 deletions js/src/data.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import { Vector } from './vector.js';
import { BufferType, Type, UnionMode } from './enum.js';
import { DataType, LargeUtf8, strideForType } from './type.js';
import { DataType, strideForType } from './type.js';
import { popcnt_bit_range, truncateBitmap } from './util/bit.js';

// When slicing, we do not know the null count of the sliced range without
Expand Down Expand Up @@ -253,7 +253,7 @@ export class Data<T extends DataType = DataType> {

import {
Dictionary,
Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct,
Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct,
Float,
Int,
Date_,
Expand Down Expand Up @@ -324,6 +324,14 @@ class MakeDataVisitor extends Visitor {
const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props;
return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]);
}
/**
 * Builds a `Data` instance for a `LargeBinary` column from loosely-typed props.
 *
 * `data` and `nullBitmap` are normalized with `toUint8Array`, and `valueOffsets`
 * with `toBigInt64Array`. Defaults: `offset` is 0, `length` is
 * `valueOffsets.length - 1`, and `nullCount` is -1 when a `nullBitmap` was
 * supplied (presumably meaning "not yet counted" — confirm) and 0 otherwise.
 */
public visitLargeBinary<T extends LargeBinary>(props: LargeBinaryDataProps<T>) {
    // Normalize the three buffers first; the `length` default below needs the offsets.
    const data = toUint8Array(props['data']);
    const nullBitmap = toUint8Array(props['nullBitmap']);
    const valueOffsets = toBigInt64Array(props['valueOffsets']);
    // Quoted keys are kept as in the original (presumably so the names survive minification).
    const {
        ['type']: type,
        ['offset']: offset = 0,
        ['length']: length = valueOffsets.length - 1,
        ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0,
    } = props;
    return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]);
}
public visitFixedSizeBinary<T extends FixedSizeBinary>(props: FixedSizeBinaryDataProps<T>) {
const { ['type']: type, ['offset']: offset = 0 } = props;
const nullBitmap = toUint8Array(props['nullBitmap']);
Expand Down Expand Up @@ -444,6 +452,7 @@ interface IntervalDataProps<T extends Interval> extends DataProps_<T> { data?: D
interface DurationDataProps<T extends Duration> extends DataProps_<T> { data?: DataBuffer<T> }
interface FixedSizeBinaryDataProps<T extends FixedSizeBinary> extends DataProps_<T> { data?: DataBuffer<T> }
interface BinaryDataProps<T extends Binary> extends DataProps_<T> { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer<T> }
/** Props accepted when making `LargeBinary` data: `valueOffsets` is required and may be either a large or a regular offsets buffer; `data` is optional. */
interface LargeBinaryDataProps<T extends LargeBinary> extends DataProps_<T> { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer<T> }
interface Utf8DataProps<T extends Utf8> extends DataProps_<T> { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer<T> }
interface LargeUtf8DataProps<T extends LargeUtf8> extends DataProps_<T> { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer<T> }
interface ListDataProps<T extends List> extends DataProps_<T> { valueOffsets: ValueOffsetsBuffer; child: Data<T['valueType']> }
Expand All @@ -468,6 +477,7 @@ export type DataProps<T extends DataType> = (
T extends Duration /* */ ? DurationDataProps<T> :
T extends FixedSizeBinary /* */ ? FixedSizeBinaryDataProps<T> :
T extends Binary /* */ ? BinaryDataProps<T> :
T extends LargeBinary /* */ ? LargeBinaryDataProps<T> :
T extends Utf8 /* */ ? Utf8DataProps<T> :
T extends LargeUtf8 /* */ ? LargeUtf8DataProps<T> :
T extends List /* */ ? ListDataProps<T> :
Expand Down Expand Up @@ -495,6 +505,7 @@ export function makeData<T extends Interval>(props: IntervalDataProps<T>): Data<
export function makeData<T extends Duration>(props: DurationDataProps<T>): Data<T>;
export function makeData<T extends FixedSizeBinary>(props: FixedSizeBinaryDataProps<T>): Data<T>;
export function makeData<T extends Binary>(props: BinaryDataProps<T>): Data<T>;
export function makeData<T extends LargeBinary>(props: LargeBinaryDataProps<T>): Data<T>;
export function makeData<T extends Utf8>(props: Utf8DataProps<T>): Data<T>;
export function makeData<T extends LargeUtf8>(props: LargeUtf8DataProps<T>): Data<T>;
export function makeData<T extends List>(props: ListDataProps<T>): Data<T>;
Expand Down
3 changes: 2 additions & 1 deletion js/src/enum.ts
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ export enum Type {
FixedSizeBinary = 15, /** Fixed-size binary. Each value occupies the same number of bytes */
FixedSizeList = 16, /** Fixed-size list. Each value occupies the same number of bytes */
Map = 17, /** Map of named logical types */
Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. */
Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds */
LargeBinary = 19, /** Large variable-length bytes (no guarantee of UTF8-ness) */
LargeUtf8 = 20, /** Large variable-length string as List<Char> */

Dictionary = -1, /** Dictionary aka Category type */
Expand Down
4 changes: 4 additions & 0 deletions js/src/interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import type { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder
import type { Utf8Builder } from './builder/utf8.js';
import type { LargeUtf8Builder } from './builder/largeutf8.js';
import type { BinaryBuilder } from './builder/binary.js';
import type { LargeBinaryBuilder } from './builder/largebinary.js';
import type { ListBuilder } from './builder/list.js';
import type { FixedSizeListBuilder } from './builder/fixedsizelist.js';
import type { MapBuilder } from './builder/map.js';
Expand Down Expand Up @@ -210,6 +211,7 @@ export type TypeToDataType<T extends Type> = {
[Type.Utf8]: type.Utf8;
[Type.LargeUtf8]: type.LargeUtf8;
[Type.Binary]: type.Binary;
[Type.LargeBinary]: type.LargeBinary;
[Type.FixedSizeBinary]: type.FixedSizeBinary;
[Type.Date]: type.Date_;
[Type.DateDay]: type.DateDay;
Expand Down Expand Up @@ -264,6 +266,7 @@ type TypeToBuilder<T extends Type = any, TNull = any> = {
[Type.Utf8]: Utf8Builder<TNull>;
[Type.LargeUtf8]: LargeUtf8Builder<TNull>;
[Type.Binary]: BinaryBuilder<TNull>;
[Type.LargeBinary]: LargeBinaryBuilder<TNull>;
[Type.FixedSizeBinary]: FixedSizeBinaryBuilder<TNull>;
[Type.Date]: DateBuilder<any, TNull>;
[Type.DateDay]: DateDayBuilder<TNull>;
Expand Down Expand Up @@ -318,6 +321,7 @@ type DataTypeToBuilder<T extends DataType = any, TNull = any> = {
[Type.Utf8]: T extends type.Utf8 ? Utf8Builder<TNull> : never;
[Type.LargeUtf8]: T extends type.LargeUtf8 ? LargeUtf8Builder<TNull> : never;
[Type.Binary]: T extends type.Binary ? BinaryBuilder<TNull> : never;
[Type.LargeBinary]: T extends type.LargeBinary ? LargeBinaryBuilder<TNull> : never;
[Type.FixedSizeBinary]: T extends type.FixedSizeBinary ? FixedSizeBinaryBuilder<TNull> : never;
[Type.Date]: T extends type.Date_ ? DateBuilder<T, TNull> : never;
[Type.DateDay]: T extends type.DateDay ? DateDayBuilder<TNull> : never;
Expand Down
3 changes: 2 additions & 1 deletion js/src/ipc/metadata/json.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import { Schema, Field } from '../../schema.js';
import {
DataType, Dictionary, TimeBitWidth,
Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary,
Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary,
List, FixedSizeList, Map_, Struct, Union,
Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration,
} from '../../type.js';
Expand Down Expand Up @@ -149,6 +149,7 @@ function typeFromJSON(f: any, children?: Field[]): DataType<any> {
case 'NONE': return new Null();
case 'null': return new Null();
case 'binary': return new Binary();
case 'largebinary': return new LargeBinary();
case 'utf8': return new Utf8();
case 'largeutf8': return new LargeUtf8();
case 'bool': return new Bool();
Expand Down
3 changes: 2 additions & 1 deletion js/src/ipc/metadata/message.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ import ByteBuffer = flatbuffers.ByteBuffer;

import {
DataType, Dictionary, TimeBitWidth,
Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary,
Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary,
List, FixedSizeList, Map_, Struct, Union,
Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration,
} from '../../type.js';
Expand Down Expand Up @@ -432,6 +432,7 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType<any> {
case Type['NONE']: return new Null();
case Type['Null']: return new Null();
case Type['Binary']: return new Binary();
case Type['LargeBinary']: return new LargeBinary();
case Type['Utf8']: return new Utf8();
case Type['LargeUtf8']: return new LargeUtf8();
case Type['Bool']: return new Bool();
Expand Down
5 changes: 2 additions & 3 deletions js/src/ipc/writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ export class RecordBatchJSONWriter<T extends TypeMap = any> extends RecordBatchW
protected _writeDictionaryBatch(dictionary: Data, id: number, isDelta = false) {
this._dictionaryDeltaOffsets.set(id, dictionary.length + (this._dictionaryDeltaOffsets.get(id) || 0));
this._write(this._dictionaryBlocks.length === 0 ? ` ` : `,\n `);
this._write(`${dictionaryBatchToJSON(dictionary, id, isDelta)}`);
this._write(dictionaryBatchToJSON(dictionary, id, isDelta));
this._dictionaryBlocks.push(new FileBlock(0, 0, 0));
return this;
}
Expand All @@ -401,7 +401,6 @@ export class RecordBatchJSONWriter<T extends TypeMap = any> extends RecordBatchW
return this;
}
public close() {

if (this._dictionaries.length > 0) {
this._write(`,\n "dictionaries": [\n`);
for (const batch of this._dictionaries) {
Expand All @@ -413,7 +412,7 @@ export class RecordBatchJSONWriter<T extends TypeMap = any> extends RecordBatchW
if (this._recordBatches.length > 0) {
for (let i = -1, n = this._recordBatches.length; ++i < n;) {
this._write(i === 0 ? `,\n "batches": [\n ` : `,\n `);
this._write(`${recordBatchToJSON(this._recordBatches[i])}`);
this._write(recordBatchToJSON(this._recordBatches[i]));
this._recordBatchBlocks.push(new FileBlock(0, 0, 0));
}
this._write(`\n ]`);
Expand Down
18 changes: 17 additions & 1 deletion js/src/type.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ export abstract class DataType<TType extends Type = Type, TChildren extends Type
/** @nocollapse */ static isInt(x: any): x is Int_ { return x?.typeId === Type.Int; }
/** @nocollapse */ static isFloat(x: any): x is Float { return x?.typeId === Type.Float; }
/** @nocollapse */ static isBinary(x: any): x is Binary { return x?.typeId === Type.Binary; }
/** @nocollapse */ static isLargeBinary(x: any): x is LargeBinary { return x?.typeId === Type.LargeBinary; }
/** @nocollapse */ static isUtf8(x: any): x is Utf8 { return x?.typeId === Type.Utf8; }
/** @nocollapse */ static isLargeUtf8(x: any): x is LargeUtf8 { return x?.typeId === Type.LargeUtf8; }
/** @nocollapse */ static isBool(x: any): x is Bool { return x?.typeId === Type.Bool; }
Expand Down Expand Up @@ -250,6 +251,22 @@ export class Binary extends DataType<Type.Binary> {
})(Binary.prototype);
}

/**
 * Type-level shape of `LargeBinary`: values are `Uint8Array`s, the values
 * buffer is a `Uint8Array`, and the offsets buffer is a `BigInt64Array`.
 * Merged with the class of the same name declared below.
 * @ignore
 */
export interface LargeBinary extends DataType<Type.LargeBinary> { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor<Uint8Array>; OffsetArrayType: BigIntArrayConstructor<BigInt64Array> }
/**
 * Data type whose `typeId` is `Type.LargeBinary`.
 * @ignore
 */
export class LargeBinary extends DataType<Type.LargeBinary> {
constructor() {
super();
}
// Identifies this type as `Type.LargeBinary`; the cast keeps the literal enum member type.
public get typeId() { return Type.LargeBinary as Type.LargeBinary; }
// Human-readable name of the type.
public toString() { return `LargeBinary`; }
// Static initializer: the function below is invoked immediately with
// `LargeBinary.prototype`, so the array constructors and the string tag are
// assigned once on the prototype rather than per instance.
protected static [Symbol.toStringTag] = ((proto: LargeBinary) => {
// Constructor used for the values (data) buffer.
(<any>proto).ArrayType = Uint8Array;
// Constructor used for the offsets buffer (64-bit signed integers).
(<any>proto).OffsetArrayType = BigInt64Array;
// Sets the prototype's string tag and returns it, so the static member holds the same string.
return proto[Symbol.toStringTag] = 'LargeBinary';
})(LargeBinary.prototype);
}

/** @ignore */
export interface Utf8 extends DataType<Type.Utf8> { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: string; ArrayType: TypedArrayConstructor<Uint8Array>; OffsetArrayType: TypedArrayConstructor<Int32Array> }
/** @ignore */
Expand Down Expand Up @@ -601,7 +618,6 @@ export class FixedSizeBinary extends DataType<Type.FixedSizeBinary> {
protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => {
(<any>proto).byteWidth = null;
(<any>proto).ArrayType = Uint8Array;
(<any>proto).OffsetArrayType = Int32Array;
return proto[Symbol.toStringTag] = 'FixedSizeBinary';
})(FixedSizeBinary.prototype);
}
Expand Down
Loading

0 comments on commit 4ec6544

Please sign in to comment.