-
Notifications
You must be signed in to change notification settings - Fork 46
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
First stage of optimizing in-memory string column representation: deserialize string column protobuf msg into packed byte array with offsets.
- Loading branch information
Tobin Baker
committed
Aug 12, 2015
1 parent
2d53469
commit 9270170
Showing
2 changed files
with
67 additions
and
9 deletions.
There are no files selected for viewing
60 changes: 60 additions & 0 deletions
60
src/edu/washington/escience/myria/column/StringPackedColumn.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
package edu.washington.escience.myria.column; | ||
|
||
import java.nio.ByteBuffer; | ||
import java.nio.charset.StandardCharsets; | ||
|
||
import com.google.common.base.Preconditions; | ||
|
||
/** | ||
* A column of String values, packed into a UTF-8 encoded byte array. | ||
* | ||
* | ||
*/ | ||
public final class StringPackedColumn extends StringColumn { | ||
/** Required for Java serialization. */ | ||
private static final long serialVersionUID = 1L; | ||
/** A read-only buffer containing the packed UTF-8 character data. */ | ||
private final ByteBuffer data; | ||
/** Contains the number of bytes in data. */ | ||
private final int numBytes; | ||
/** Contains the offset of each string in order. */ | ||
private final int[] offsets; | ||
|
||
/** | ||
* Constructs a new column. | ||
* | ||
* @param data the data | ||
* @param offsets offsets of strings within data column | ||
* */ | ||
public StringPackedColumn(final ByteBuffer data, final int numBytes, final int[] offsets) { | ||
this.data = data; | ||
this.numBytes = numBytes; | ||
this.offsets = offsets; | ||
} | ||
|
||
/** | ||
* Returns the element at the specified row in this column. | ||
* | ||
* @param row row of element to return. | ||
* @return the element at the specified row in this column. | ||
*/ | ||
@Override | ||
public String getString(final int row) { | ||
Preconditions.checkElementIndex(row, size()); | ||
int len; | ||
if (row == offsets.length - 1) { | ||
len = numBytes - offsets[row]; | ||
} else { | ||
len = offsets[row + 1] - offsets[row]; | ||
} | ||
byte[] strBytes = new byte[len]; | ||
data.position(offsets[row]); | ||
data.get(strBytes, 0, len); | ||
return new String(strBytes, StandardCharsets.UTF_8); | ||
} | ||
|
||
@Override | ||
public int size() { | ||
return offsets.length; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,20 @@ | ||
package edu.washington.escience.myria.column.builder; | ||
|
||
import java.nio.BufferOverflowException; | ||
import java.nio.ByteBuffer; | ||
import java.sql.ResultSet; | ||
import java.sql.SQLException; | ||
import java.util.List; | ||
import java.util.Objects; | ||
|
||
import com.almworks.sqlite4java.SQLiteException; | ||
import com.almworks.sqlite4java.SQLiteStatement; | ||
import com.google.common.base.Preconditions; | ||
import com.google.common.primitives.Ints; | ||
|
||
import edu.washington.escience.myria.Type; | ||
import edu.washington.escience.myria.column.StringArrayColumn; | ||
import edu.washington.escience.myria.column.StringColumn; | ||
import edu.washington.escience.myria.column.StringPackedColumn; | ||
import edu.washington.escience.myria.column.mutable.StringMutableColumn; | ||
import edu.washington.escience.myria.proto.DataProto.ColumnMessage; | ||
import edu.washington.escience.myria.proto.DataProto.StringColumnMessage; | ||
|
@@ -66,14 +68,10 @@ public static StringColumn buildFromProtobuf(final ColumnMessage message, final | |
"Trying to construct StringColumn from non-STRING ColumnMessage %s", message.getType()); | ||
Preconditions.checkArgument(message.hasStringColumn(), "ColumnMessage has type STRING but no StringColumn"); | ||
final StringColumnMessage stringColumn = message.getStringColumn(); | ||
List<Integer> startIndices = stringColumn.getStartIndicesList(); | ||
List<Integer> endIndices = stringColumn.getEndIndicesList(); | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
senderista
Contributor
|
||
String[] newData = new String[numTuples]; | ||
String allStrings = stringColumn.getData().toStringUtf8(); | ||
for (int i = 0; i < numTuples; i++) { | ||
newData[i] = allStrings.substring(startIndices.get(i), endIndices.get(i)); | ||
} | ||
return new StringColumnBuilder(newData, numTuples).build(); | ||
int numBytes = stringColumn.getData().size(); | ||
ByteBuffer data = stringColumn.getData().asReadOnlyByteBuffer(); | ||
int[] offsets = Ints.toArray(stringColumn.getStartIndicesList()); | ||
return new StringPackedColumn(data, numBytes, offsets); | ||
} | ||
|
||
@Override | ||
|
Looks like the
end_indices
field in column.proto
is not used anymore after this change..?