diff --git a/src/edu/washington/escience/myria/column/StringPackedColumn.java b/src/edu/washington/escience/myria/column/StringPackedColumn.java new file mode 100644 index 000000000..0b3c49bb5 --- /dev/null +++ b/src/edu/washington/escience/myria/column/StringPackedColumn.java @@ -0,0 +1,60 @@ +package edu.washington.escience.myria.column; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; + +import com.google.common.base.Preconditions; + +/** + * A column of String values, packed into a UTF-8 encoded byte array. + * + * + */ +public final class StringPackedColumn extends StringColumn { + /** Required for Java serialization. */ + private static final long serialVersionUID = 1L; + /** A read-only buffer containing the packed UTF-8 character data. */ + private final ByteBuffer data; + /** Contains the number of bytes in data. */ + private final int numBytes; + /** Contains the offset of each string in order. */ + private final int[] offsets; + + /** + * Constructs a new column. + * + * @param data the data + * @param offsets offsets of strings within data column + * */ + public StringPackedColumn(final ByteBuffer data, final int numBytes, final int[] offsets) { + this.data = data; + this.numBytes = numBytes; + this.offsets = offsets; + } + + /** + * Returns the element at the specified row in this column. + * + * @param row row of element to return. + * @return the element at the specified row in this column. + */ + @Override + public String getString(final int row) { + Preconditions.checkElementIndex(row, size()); + int len; + if (row == offsets.length - 1) { + len = numBytes - offsets[row]; + } else { + len = offsets[row + 1] - offsets[row]; + } + byte[] strBytes = new byte[len]; + data.position(offsets[row]); + data.get(strBytes, 0, len); + return new String(strBytes, StandardCharsets.UTF_8); + } + + @Override + public int size() { + return offsets.length; + } +} diff --git a/src/edu/washington/escience/myria/column/builder/StringColumnBuilder.java b/src/edu/washington/escience/myria/column/builder/StringColumnBuilder.java index 77f635b2d..a8f04477e 100644 --- a/src/edu/washington/escience/myria/column/builder/StringColumnBuilder.java +++ b/src/edu/washington/escience/myria/column/builder/StringColumnBuilder.java @@ -1,18 +1,20 @@ package edu.washington.escience.myria.column.builder; import java.nio.BufferOverflowException; +import java.nio.ByteBuffer; import java.sql.ResultSet; import java.sql.SQLException; -import java.util.List; import java.util.Objects; import com.almworks.sqlite4java.SQLiteException; import com.almworks.sqlite4java.SQLiteStatement; import com.google.common.base.Preconditions; +import com.google.common.primitives.Ints; import edu.washington.escience.myria.Type; import edu.washington.escience.myria.column.StringArrayColumn; import edu.washington.escience.myria.column.StringColumn; +import edu.washington.escience.myria.column.StringPackedColumn; import edu.washington.escience.myria.column.mutable.StringMutableColumn; import edu.washington.escience.myria.proto.DataProto.ColumnMessage; import edu.washington.escience.myria.proto.DataProto.StringColumnMessage; @@ -66,14 +68,10 @@ public static StringColumn buildFromProtobuf(final ColumnMessage message, final "Trying to construct StringColumn from non-STRING ColumnMessage %s", message.getType()); Preconditions.checkArgument(message.hasStringColumn(), "ColumnMessage has type STRING but no StringColumn"); final StringColumnMessage stringColumn = message.getStringColumn(); - List startIndices = stringColumn.getStartIndicesList(); - List endIndices = stringColumn.getEndIndicesList(); - String[] newData = new String[numTuples]; - String allStrings = stringColumn.getData().toStringUtf8(); - for (int i = 0; i < numTuples; i++) { - newData[i] = allStrings.substring(startIndices.get(i), endIndices.get(i)); - } - return new StringColumnBuilder(newData, numTuples).build(); + int numBytes = stringColumn.getData().size(); + ByteBuffer data = stringColumn.getData().asReadOnlyByteBuffer(); + int[] offsets = Ints.toArray(stringColumn.getStartIndicesList()); + return new StringPackedColumn(data, numBytes, offsets); } @Override