Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: String hashing, creation and concatenating #560

Merged
merged 4 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 36 additions & 14 deletions nova_vm/src/ecmascript/types/language/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use crate::{
};

pub use data::StringHeapData;
use wtf8::Wtf8Buf;

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
Expand Down Expand Up @@ -322,11 +323,32 @@ impl<'a> String<'a> {
Empty,
ExistingString(HeapString<'a>),
SmallString { data: [u8; 7], len: usize },
String(std::string::String),
String(Wtf8Buf),
}
let strings = strings.as_ref();
let mut status = if strings.len() > 1 {
let len = strings.iter().fold(0usize, |a, s| a + s.len(agent));
if len > 7 {
Status::String(Wtf8Buf::with_capacity(len))
} else {
Status::Empty
}
} else {
Status::Empty
};

/// Appends the contents of `string` to the end of `buf`.
///
/// Heap-allocated strings are looked up through `agent` and appended via
/// their WTF-8 view; small strings are appended via their `&str` view.
fn push_string_to_wtf8(agent: &Agent, buf: &mut Wtf8Buf, string: String) {
    match string {
        String::SmallString(inline) => buf.push_str(inline.as_str()),
        String::String(on_heap) => buf.push_wtf8(agent[on_heap].as_wtf8()),
    }
}
let mut status = Status::Empty;

for string in strings.as_ref() {
for string in strings {
if string.is_empty_string() {
continue;
}
Expand All @@ -341,11 +363,12 @@ impl<'a> String<'a> {
String::String(idx) => Status::ExistingString(*idx),
};
}
Status::ExistingString(idx) => {
Status::ExistingString(heap_string) => {
let heap_string = *heap_string;
let mut result =
std::string::String::with_capacity(agent[*idx].len() + string.len(agent));
result.push_str(agent[*idx].as_str());
result.push_str(string.as_str(agent));
Wtf8Buf::with_capacity(agent[heap_string].len() + string.len(agent));
result.push_wtf8(agent[heap_string].as_wtf8());
push_string_to_wtf8(agent, &mut result, *string);
status = Status::String(result)
}
Status::SmallString { data, len } => {
Expand All @@ -358,15 +381,15 @@ impl<'a> String<'a> {
.copy_from_slice(&smstr.data()[..string_len]);
*len += string_len;
} else {
let mut result = std::string::String::with_capacity(*len + string_len);
let mut result = Wtf8Buf::with_capacity(*len + string_len);
// SAFETY: Since SmallStrings are guaranteed UTF-8, `&data[..len]` is the result
// of concatenating UTF-8 strings, which is always valid UTF-8.
result.push_str(unsafe { std::str::from_utf8_unchecked(&data[..*len]) });
result.push_str(string.as_str(agent));
push_string_to_wtf8(agent, &mut result, *string);
status = Status::String(result);
}
}
Status::String(buffer) => buffer.push_str(string.as_str(agent)),
Status::String(buffer) => push_string_to_wtf8(agent, buffer, *string),
}
}

Expand All @@ -379,7 +402,7 @@ impl<'a> String<'a> {
let str_slice = unsafe { std::str::from_utf8_unchecked(&data[..len]) };
SmallString::from_str_unchecked(str_slice).into()
}
Status::String(string) => agent.heap.create(string).bind(gc),
Status::String(string) => agent.heap.create(string.into_string().unwrap()).bind(gc),
}
}

Expand Down Expand Up @@ -559,9 +582,8 @@ impl Scoped<'_, String<'static>> {
}
}

impl CreateHeapData<StringHeapData, String<'static>> for Heap {
fn create(&mut self, data: StringHeapData) -> String<'static> {
let hash = self.string_hasher.hash_one(data.as_str());
impl CreateHeapData<(StringHeapData, u64), String<'static>> for Heap {
fn create(&mut self, (data, hash): (StringHeapData, u64)) -> String<'static> {
self.strings.push(Some(data));
let index = StringIndex::last(&self.strings);
let heap_string = HeapString(index);
Expand Down
25 changes: 23 additions & 2 deletions nova_vm/src/ecmascript/types/language/string/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::{cell::OnceCell, num::NonZeroUsize};
use std::{cell::OnceCell, hash::Hash, num::NonZeroUsize};

use wtf8::{Wtf8, Wtf8Buf};

Expand All @@ -24,7 +24,12 @@ impl PartialEq for StringHeapData {
return true;
}
}
self.as_str() == other.as_str()
match (&self.data, &other.data) {
(StringBuffer::Owned(a), StringBuffer::Owned(b)) => a == b,
(StringBuffer::Owned(a), StringBuffer::Static(b)) => a == b,
(StringBuffer::Static(a), StringBuffer::Owned(b)) => a == b,
(StringBuffer::Static(a), StringBuffer::Static(b)) => a == b,
}
}
}
impl Eq for StringHeapData {}
Expand All @@ -47,6 +52,15 @@ pub(crate) enum StringBuffer {
Static(&'static Wtf8),
}

impl Hash for StringBuffer {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
match self {
StringBuffer::Owned(wtf8_buf) => wtf8_buf.hash(state),
StringBuffer::Static(wtf8) => wtf8.hash(state),
}
}
}

impl StringHeapData {
/// The maximum UTF-16 length of a JS string, according to the spec (2^53 - 1).
const MAX_UTF16_LENGTH: usize = (1 << 53) - 1;
Expand Down Expand Up @@ -213,6 +227,13 @@ impl StringHeapData {
}
}

/// Returns the string contents as a borrowed `Wtf8` slice, whether the
/// backing buffer is the owned or the static variant.
pub fn as_wtf8(&self) -> &Wtf8 {
    let contents: &Wtf8 = match self.data {
        StringBuffer::Static(text) => text,
        StringBuffer::Owned(ref owned) => owned,
    };
    contents
}

pub fn from_str(str: &str) -> Self {
debug_assert!(str.len() > 7);
assert!(str.len() <= Self::MAX_UTF8_LENGTH, "String is too long.");
Expand Down
6 changes: 4 additions & 2 deletions nova_vm/src/ecmascript/types/language/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use wtf8::Wtf8;

use super::{
bigint::{HeapBigInt, SmallBigInt},
number::HeapNumber,
Expand Down Expand Up @@ -640,10 +642,10 @@ impl Value {
}
Value::String(data) => {
// Skip discriminant hashing in strings
arena[data].as_str().hash(hasher);
arena[data].data.hash(hasher);
}
Value::SmallString(data) => {
data.as_str().hash(hasher);
Wtf8::from_str(data.as_str()).hash(hasher);
}
Value::Symbol(data) => {
discriminant.hash(hasher);
Expand Down
52 changes: 34 additions & 18 deletions nova_vm/src/engine/bytecode/executable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,30 +265,46 @@ impl Executable {
}

pub(super) fn get_instruction(instructions: &[u8], ip: &mut usize) -> Option<Instr> {
if *ip >= instructions.len() {
let len = instructions.len();
let cur_ip = *ip;
if cur_ip >= len {
return None;
}

let kind: Instruction = unsafe { std::mem::transmute::<u8, Instruction>(instructions[*ip]) };
*ip += 1;

let mut args: [Option<IndexType>; 2] = [None, None];

for item in args.iter_mut().take(kind.argument_count() as usize) {
let length = instructions[*ip..].len();
if length >= 2 {
let bytes = IndexType::from_ne_bytes(unsafe {
*std::mem::transmute::<*const u8, *const [u8; 2]>(instructions[*ip..].as_ptr())
});
let kind: Instruction = unsafe { std::mem::transmute::<u8, Instruction>(instructions[cur_ip]) };

let arg_count = kind.argument_count() as usize;

let cur_ip = *ip;
match arg_count {
0 => Some(Instr {
kind,
args: [None, None],
}),
1 => {
let bytes: [u8; 2] = [instructions[cur_ip], instructions[cur_ip + 1]];
let arg0 = IndexType::from_ne_bytes(bytes);
*ip += 2;
*item = Some(bytes);
} else {
*ip += 1;
*item = None;
Some(Instr {
kind,
args: [Some(arg0), None],
})
}
2 => {
let bytes: [[u8; 2]; 2] = [
[instructions[cur_ip], instructions[cur_ip + 1]],
[instructions[cur_ip + 2], instructions[cur_ip + 3]],
];
let arg0 = IndexType::from_ne_bytes(bytes[0]);
let arg1 = IndexType::from_ne_bytes(bytes[1]);
*ip += 4;
Some(Instr {
kind,
args: [Some(arg0), Some(arg1)],
})
}
_ => unreachable!(),
}

Some(Instr { kind, args })
}

impl Index<Executable> for Agent {
Expand Down
6 changes: 5 additions & 1 deletion nova_vm/src/engine/bytecode/vm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ use crate::{
heap::{CompactionLists, HeapMarkAndSweep, WellKnownSymbolIndexes, WorkQueues},
};

use super::executable::get_instruction;

struct EmptyParametersList(ast::FormalParameters<'static>);
unsafe impl Send for EmptyParametersList {}
unsafe impl Sync for EmptyParametersList {}
Expand Down Expand Up @@ -289,7 +291,9 @@ impl<'a> Vm {
let do_gc = !agent.options.disable_gc;
#[cfg(feature = "interleaved-gc")]
let mut instr_count = 0u8;
while let Some(instr) = executable.get_instruction(agent, &mut self.ip) {

let instructions = executable.get_instructions(agent);
while let Some(instr) = get_instruction(instructions, &mut self.ip) {
#[cfg(feature = "interleaved-gc")]
if do_gc {
instr_count = instr_count.wrapping_add(1);
Expand Down
44 changes: 27 additions & 17 deletions nova_vm/src/heap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ use crate::{
use ahash::AHashMap;
use hashbrown::HashTable;
pub(crate) use heap_bits::{CompactionLists, HeapMarkAndSweep, WorkQueues};
use indexes::IntoBaseIndex;
use wtf8::Wtf8;

#[derive(Debug)]
pub struct Heap {
Expand Down Expand Up @@ -322,11 +322,13 @@ impl Heap {
/// guaranteed to never equal true.
pub(crate) unsafe fn alloc_str(&mut self, message: &str) -> String<'static> {
let found = self.find_equal_string(message);
if let Some(idx) = found {
return idx;
match found {
Ok(string) => string,
Err(hash) => {
let data = StringHeapData::from_str(message);
self.create((data, hash))
}
}
let data = StringHeapData::from_str(message);
self.create(data)
}

/// Allocate a static string onto the Agent heap
Expand All @@ -343,11 +345,13 @@ impl Heap {
/// guaranteed to never equal true.
unsafe fn alloc_string(&mut self, message: std::string::String) -> String<'static> {
let found = self.find_equal_string(message.as_str());
if let Some(idx) = found {
return idx;
match found {
Ok(string) => string,
Err(hash) => {
let data = StringHeapData::from_string(message);
self.create((data, hash))
}
}
let data = StringHeapData::from_string(message);
self.create(data)
}

/// Allocate a static string onto the Agent heap
Expand All @@ -364,24 +368,30 @@ impl Heap {
/// guaranteed to never equal true.
pub(crate) unsafe fn alloc_static_str(&mut self, message: &'static str) -> String<'static> {
let found = self.find_equal_string(message);
if let Some(idx) = found {
return idx;
match found {
Ok(string) => string,
Err(hash) => {
let data = StringHeapData::from_static_str(message);
self.create((data, hash))
}
}
let data = StringHeapData::from_static_str(message);
self.create(data)
}

fn find_equal_string(&self, message: &str) -> Option<String<'static>> {
/// Find an existing heap String equal to `message`, or return the string's hash.
fn find_equal_string(&self, message: &str) -> Result<String<'static>, u64> {
debug_assert!(message.len() > 7);
let message = Wtf8::from_str(message);
let hash = self.string_hasher.hash_one(message);
self.string_lookup_table
.find(hash, |heap_string| {
let heap_str = self.strings[heap_string.into_base_index().into_index()]
let heap_str = self.strings[heap_string.get_index()]
.as_ref()
.map(|string| string.as_str());
heap_str == Some(message)
.unwrap()
.as_wtf8();
heap_str == message
})
.map(|&heap_string| heap_string.into())
.ok_or(hash)
}

/// Allocate a 64-bit floating point number onto the Agent heap
Expand Down