Skip to content

Commit

Permalink
First stage of optimizing in-memory string column representation: des…
Browse files Browse the repository at this point in the history
…erialize string column protobuf msg into packed byte array with offsets.
  • Loading branch information
Tobin Baker committed Aug 12, 2015
1 parent 2d53469 commit 9270170
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 9 deletions.
60 changes: 60 additions & 0 deletions src/edu/washington/escience/myria/column/StringPackedColumn.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package edu.washington.escience.myria.column;

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

import com.google.common.base.Preconditions;

/**
 * A column of String values, packed into a single UTF-8 encoded byte buffer.
 *
 * <p>
 * The string at row {@code i} occupies the bytes from {@code offsets[i]} (inclusive) to {@code offsets[i + 1]}
 * (exclusive); the last string ends at {@code numBytes}. Strings are decoded from the buffer on every call to
 * {@link #getString(int)}; nothing is cached.
 */
public final class StringPackedColumn extends StringColumn {
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;
  /**
   * A buffer containing the packed UTF-8 character data. It is only ever read through a duplicate, so its position
   * and limit are never modified by this class.
   */
  // NOTE(review): java.nio.ByteBuffer does not implement Serializable, so Java serialization of this class would
  // fail on this field if it is ever attempted -- confirm whether that path is actually used.
  private final ByteBuffer data;
  /** Contains the number of bytes in data. */
  private final int numBytes;
  /** Contains the byte offset of the start of each string within data, in row order. */
  private final int[] offsets;

  /**
   * Constructs a new column.
   *
   * @param data the packed UTF-8 bytes of all strings in the column
   * @param numBytes the number of bytes in data; used as the end of the last string
   * @param offsets byte offsets of the start of each string within data, one entry per row
   * @throws NullPointerException if {@code data} or {@code offsets} is null
   */
  public StringPackedColumn(final ByteBuffer data, final int numBytes, final int[] offsets) {
    this.data = Preconditions.checkNotNull(data, "data");
    this.numBytes = numBytes;
    this.offsets = Preconditions.checkNotNull(offsets, "offsets");
  }

  /**
   * Returns the element at the specified row in this column.
   *
   * @param row row of element to return.
   * @return the element at the specified row in this column.
   * @throws IndexOutOfBoundsException if {@code row} is negative or not less than {@link #size()}
   */
  @Override
  public String getString(final int row) {
    Preconditions.checkElementIndex(row, size());
    final int start = offsets[row];
    // The last string has no successor offset, so it runs to the end of the data.
    final int end = (row == offsets.length - 1) ? numBytes : offsets[row + 1];
    final byte[] strBytes = new byte[end - start];
    // Read through a duplicate: it shares the content but has an independent position, so concurrent readers of
    // this column do not race on the shared buffer's cursor and the caller's buffer state is left untouched.
    final ByteBuffer view = data.duplicate();
    view.position(start);
    view.get(strBytes);
    return new String(strBytes, StandardCharsets.UTF_8);
  }

  @Override
  public int size() {
    return offsets.length;
  }
}
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
package edu.washington.escience.myria.column.builder;

import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.List;
import java.util.Objects;

import com.almworks.sqlite4java.SQLiteException;
import com.almworks.sqlite4java.SQLiteStatement;
import com.google.common.base.Preconditions;
import com.google.common.primitives.Ints;

import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.column.StringArrayColumn;
import edu.washington.escience.myria.column.StringColumn;
import edu.washington.escience.myria.column.StringPackedColumn;
import edu.washington.escience.myria.column.mutable.StringMutableColumn;
import edu.washington.escience.myria.proto.DataProto.ColumnMessage;
import edu.washington.escience.myria.proto.DataProto.StringColumnMessage;
Expand Down Expand Up @@ -66,14 +68,10 @@ public static StringColumn buildFromProtobuf(final ColumnMessage message, final
"Trying to construct StringColumn from non-STRING ColumnMessage %s", message.getType());
Preconditions.checkArgument(message.hasStringColumn(), "ColumnMessage has type STRING but no StringColumn");
final StringColumnMessage stringColumn = message.getStringColumn();
List<Integer> startIndices = stringColumn.getStartIndicesList();
List<Integer> endIndices = stringColumn.getEndIndicesList();

This comment has been minimized.

Copy link
@jingjingwang

jingjingwang Aug 14, 2015

Contributor

Looks like the end_indices field in column.proto is not used anymore after this change..?

This comment has been minimized.

Copy link
@senderista

senderista Aug 14, 2015

Contributor

Yeah, but I'd rather not mess with the protobuf definitions until I get this whole zero-copy story straightened out.

String[] newData = new String[numTuples];
String allStrings = stringColumn.getData().toStringUtf8();
for (int i = 0; i < numTuples; i++) {
newData[i] = allStrings.substring(startIndices.get(i), endIndices.get(i));
}
return new StringColumnBuilder(newData, numTuples).build();
int numBytes = stringColumn.getData().size();
ByteBuffer data = stringColumn.getData().asReadOnlyByteBuffer();
int[] offsets = Ints.toArray(stringColumn.getStartIndicesList());
return new StringPackedColumn(data, numBytes, offsets);
}

@Override
Expand Down

0 comments on commit 9270170

Please sign in to comment.