Skip to content

Commit

Permalink
Merge branch 'master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
ataymano authored Mar 5, 2024
2 parents cd7c422 + 2004e72 commit e52ca46
Show file tree
Hide file tree
Showing 58 changed files with 3,389 additions and 328 deletions.
16 changes: 16 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "(ctest) Launch",
"type": "cppdbg",
"cwd": "${workspaceFolder}",
"request": "launch",
"program": "${cmake.testProgram}",
"args": [ "${cmake.testArgs}" ]
}
]
}
4 changes: 2 additions & 2 deletions ext_libs/ext_libs.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ if(RAPIDJSON_SYS_DEP)
# Since EXACT is not specified, any version compatible with 1.1.0 is accepted (>= 1.1.0)
find_package(RapidJSON 1.1.0 CONFIG REQUIRED)
add_library(RapidJSON INTERFACE)
target_include_directories(RapidJSON INTERFACE ${RapidJSON_INCLUDE_DIRS})
target_include_directories(RapidJSON INTERFACE ${RapidJSON_INCLUDE_DIRS} ${RAPIDJSON_INCLUDE_DIRS})
else()
add_library(RapidJSON INTERFACE)
target_include_directories(RapidJSON SYSTEM INTERFACE "${CMAKE_CURRENT_LIST_DIR}/rapidjson/include")
Expand Down Expand Up @@ -127,4 +127,4 @@ if(VW_FEAT_CB_GRAPH_FEEDBACK)
target_include_directories(mlpack_ensmallen SYSTEM INTERFACE ${CMAKE_CURRENT_LIST_DIR}/armadillo-code/include)

target_include_directories(mlpack_ensmallen SYSTEM INTERFACE ${CMAKE_CURRENT_LIST_DIR}/ensmallen/include)
endif()
endif()
14 changes: 8 additions & 6 deletions python/docs/source/tutorials/DFtoVW_tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -802,15 +802,17 @@
"\n",
"# Adding columns for easier visualization\n",
"weights_df[\"feature_name\"] = weights_df.apply(\n",
" lambda row: row.vw_feature_name.split(\"=\")[0]\n",
" if row.is_cat\n",
" else row.vw_feature_name,\n",
" lambda row: (\n",
" row.vw_feature_name.split(\"=\")[0] if row.is_cat else row.vw_feature_name\n",
" ),\n",
" axis=1,\n",
")\n",
"weights_df[\"feature_value\"] = weights_df.apply(\n",
" lambda row: row.vw_feature_name.split(\"=\")[1].zfill(2)\n",
" if row.is_cat\n",
" else row.vw_feature_name,\n",
" lambda row: (\n",
" row.vw_feature_name.split(\"=\")[1].zfill(2)\n",
" if row.is_cat\n",
" else row.vw_feature_name\n",
" ),\n",
" axis=1,\n",
")\n",
"weights_df.sort_values([\"feature_name\", \"feature_value\"], inplace=True)"
Expand Down
2 changes: 1 addition & 1 deletion python/docs/source/tutorials/cmd_first_steps.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,6 @@ The model predicted a value of **0**. This result means our house will not need
## More to explore

- See [Python tutorial](python_first_steps.ipynb) for a quick introduction to the basics of training and testing your model.
- To learn more about how to approach a contextual bandits problem using tVowpal Wabbit — including how to work with different contextual bandits approaches, how to format data, and understand the results — see the [Contextual Bandit Reinforcement Learning Tutorial](python_Contextual_bandits_and_Vowpal_Wabbit.ipynb).
- To learn more about how to approach a contextual bandits problem using Vowpal Wabbit — including how to work with different contextual bandits approaches, how to format data, and understand the results — see the [Contextual Bandit Reinforcement Learning Tutorial](python_Contextual_bandits_and_Vowpal_Wabbit.ipynb).
- For more on the contextual bandits approach to reinforcement learning, including a content personalization scenario, see the [Contextual Bandit Simulation Tutorial](python_Simulating_a_news_personalization_scenario_using_Contextual_Bandits.ipynb).
- See the [Linear Regression Tutorial](cmd_linear_regression.md) for a different look at the roof replacement problem and learn more about Vowpal Wabbit's format and understanding the results.
3 changes: 1 addition & 2 deletions python/tests/confidence_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,5 @@ def lblogwealth(self, *, t, sumXt, v, eta, s, alpha):

return max(
0,
(sumXt - sqrt(gamma1**2 * ll * v + gamma2**2 * ll**2) - gamma2 * ll)
/ t,
(sumXt - sqrt(gamma1**2 * ll * v + gamma2**2 * ll**2) - gamma2 * ll) / t,
)
32 changes: 17 additions & 15 deletions python/tests/crminustwo.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,21 +440,23 @@ def intervaldiff(
candidates.append(
(
gstar,
None
if isclose(kappa, 0)
else {
"kappastar": kappa,
"betastar": beta,
"gammastar": gamma,
"taustar": tau,
"ufake": ufake,
"wfake": wfake,
"rfake": rex,
"qfunc": lambda c, u, w, r, k=kappa, g=gamma, b=beta, t=tau, s=sign, num=n: -c
* (b + g * u + t * w + s * (u - w) * r)
/ ((num + 1) * k),
"mle": mle,
},
(
None
if isclose(kappa, 0)
else {
"kappastar": kappa,
"betastar": beta,
"gammastar": gamma,
"taustar": tau,
"ufake": ufake,
"wfake": wfake,
"rfake": rex,
"qfunc": lambda c, u, w, r, k=kappa, g=gamma, b=beta, t=tau, s=sign, num=n: -c
* (b + g * u + t * w + s * (u - w) * r)
/ ((num + 1) * k),
"mle": mle,
}
),
)
)

Expand Down
6 changes: 3 additions & 3 deletions python/vowpalwabbit/pyvw.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,9 +532,9 @@ def parse(
for ex in str_ex
]
):
str_ex: List[
Example
] = str_ex # pytype: disable=annotation-type-mismatch
str_ex: List[Example] = (
str_ex # pytype: disable=annotation-type-mismatch
)
return str_ex

if not isinstance(str_ex, (list, str)):
Expand Down
29 changes: 29 additions & 0 deletions test/core.vwtest.json
Original file line number Diff line number Diff line change
Expand Up @@ -6073,5 +6073,34 @@
"depends_on": [
467
]
},
{
"id": 469,
"desc": "https://github.com/VowpalWabbit/vowpal_wabbit/issues/4669",
"vw_command": "--ccb_explore_adf --dsjson -d train-sets/issue4669.dsjson -f issue4669.model",
"diff_files": {
"stderr": "train-sets/ref/issue4669_train.stderr",
"stdout": "train-sets/ref/issue4669_train.stdout"
},
"input_files": [
"train-sets/issue4669.dsjson"
]
},
{
"id": 470,
"desc": "https://github.com/VowpalWabbit/vowpal_wabbit/issues/4669",
"vw_command": "--ccb_explore_adf --dsjson --all_slots_loss --epsilon 0 -t -i issue4669.model -t -d train-sets/issue4669.dsjson -p issue4669_test_pred.txt",
"diff_files": {
"stderr": "train-sets/ref/issue4669_test.stderr",
"stdout": "train-sets/ref/issue4669_test.stdout",
"issue4669_test_pred.txt": "train-sets/ref/issue4669_test_pred.txt"
},
"input_files": [
"train-sets/issue4669.dsjson",
"issue4669.model"
],
"depends_on": [
469
]
}
]
16 changes: 10 additions & 6 deletions test/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,21 @@ def _are_same(expected: Any, actual: Any, key: str) -> Tuple[bool, str]:
elif isinstance(expected, (int, bool, str)):
return (
expected == actual,
f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}'"
if expected != actual
else "",
(
f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}'"
if expected != actual
else ""
),
)
elif isinstance(expected, (float)):
delta = abs(expected - actual)
return (
delta < epsilon,
f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}' (using epsilon: '{epsilon}')"
if delta >= epsilon
else "",
(
f"Key '{key}' value mismatch. Expected: '{expected}', but found: '{actual}' (using epsilon: '{epsilon}')"
if delta >= epsilon
else ""
),
)
elif isinstance(expected, dict):
expected_keys = set(expected.keys())
Expand Down
1 change: 1 addition & 0 deletions test/save_resume_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Test that the models generated with and without --predict_only_model produce the same predictions when loaded in test_mode.
"""

import sys
import os
import optparse
Expand Down
Binary file modified test/train-sets/0001.fb
Binary file not shown.
Binary file modified test/train-sets/ccb.fb
Binary file not shown.
Binary file modified test/train-sets/cs.fb
Binary file not shown.
1 change: 1 addition & 0 deletions test/train-sets/issue4669.dsjson
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"c":{"_multi":[{"f":"1"},{"f":"2"}],"_slots":[{"_inc":[0,1]},{"_inc":[1]}]},"_outcomes":[{"_label_cost":1.0,"_a":[0,1],"_p":[0.5,0.5]},{"_label_cost":0.0,"_a":[1],"_p":[1]}]}
Binary file modified test/train-sets/multiclass.fb
Binary file not shown.
Binary file modified test/train-sets/multilabel.fb
Binary file not shown.
Binary file modified test/train-sets/rcv1_cb_eval.fb
Binary file not shown.
Binary file modified test/train-sets/rcv1_raw_cb_small.fb
Binary file not shown.
23 changes: 23 additions & 0 deletions test/train-sets/ref/issue4669_test.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
only testing
predictions = issue4669_test_pred.txt
using no cache
Reading datafile = train-sets/issue4669.dsjson
num sources = 1
Num weight bits = 18
learning rate = 0.5
initial_t = 1
power_t = 0.5
cb_type = mtr
Enabled learners: gd, generate_interactions, scorer-identity, csoaa_ldf-rank, cb_adf, cb_explore_adf_greedy, cb_sample, shared_feature_merger, ccb_explore_adf
Input label = CCB
Output pred = DECISION_PROBS
average since example example current current current
loss last counter weight label predict features
0.000000 0.000000 1 1.0 0:1,1:0 1,None 9

finished run
number of examples = 1
weighted example sum = 1.000000
weighted label sum = 0.000000
average loss = 0.000000
total feature number = 9
Empty file.
3 changes: 3 additions & 0 deletions test/train-sets/ref/issue4669_test_pred.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1:1,0:0


22 changes: 22 additions & 0 deletions test/train-sets/ref/issue4669_train.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
final_regressor = issue4669.model
using no cache
Reading datafile = train-sets/issue4669.dsjson
num sources = 1
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
cb_type = mtr
Enabled learners: gd, generate_interactions, scorer-identity, csoaa_ldf-rank, cb_adf, cb_explore_adf_greedy, cb_sample, shared_feature_merger, ccb_explore_adf
Input label = CCB
Output pred = DECISION_PROBS
average since example example current current current
loss last counter weight label predict features
1.000000 1.000000 1 1.0 0:1,1:0 0,1 12

finished run
number of examples = 1
weighted example sum = 1.000000
weighted label sum = 0.000000
average loss = 1.000000
total feature number = 12
Empty file.
Binary file modified test/train-sets/wiki256_no_label.fb
Binary file not shown.
91 changes: 69 additions & 22 deletions utl/flatbuffer/vw_to_flat.cc
Original file line number Diff line number Diff line change
Expand Up @@ -299,10 +299,10 @@ void to_flat::create_no_label(VW::example* v, ExampleBuilder& ex_builder)
ex_builder.label = VW::parsers::flatbuffer::Createno_label(_builder, (uint8_t)'\000').Union();
}

flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> to_flat::create_namespace(VW::features::audit_iterator begin,
VW::features::audit_iterator end, VW::namespace_index index, uint64_t hash, bool audit)
// Create namespace when audit is true
flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> to_flat::create_namespace_audit(
VW::features::audit_iterator begin, VW::features::audit_iterator end, VW::namespace_index index, uint64_t hash)
{
std::vector<flatbuffers::Offset<VW::parsers::flatbuffer::Feature>> fts;
std::stringstream ss;
ss << index;

Expand All @@ -316,26 +316,61 @@ flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> to_flat::create_namespac
if (find_ns_offset == _share_examples.end())
{
flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> namespace_offset;
std::vector<flatbuffers::Offset<flatbuffers::String>> feature_names;
std::vector<float> feature_values;
std::vector<uint64_t> feature_hashes;

// new namespace
if (audit)

std::string ns_name;
for (auto it = begin; it != end; ++it)
{
std::string ns_name;
for (auto it = begin; it != end; ++it)
{
ns_name = it.audit()->ns;
fts.push_back(
VW::parsers::flatbuffer::CreateFeatureDirect(_builder, it.audit()->name.c_str(), it.value(), it.index()));
}
namespace_offset = VW::parsers::flatbuffer::CreateNamespaceDirect(_builder, ns_name.c_str(), index, &fts, hash);
if ((it.audit()->ns).c_str() != nullptr) ns_name = it.audit()->ns;

(feature_names).push_back(_builder.CreateString(it.audit()->name.c_str()));
(feature_values).push_back(it.value());
(feature_hashes).push_back(it.index());
}
else
namespace_offset = VW::parsers::flatbuffer::CreateNamespaceDirect(
_builder, ns_name.c_str(), index, hash, &feature_names, &feature_values, &feature_hashes);

_share_examples[refid] = namespace_offset;
}

return _share_examples[refid];
}

// Create namespace when audit is false
flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> to_flat::create_namespace(
features::const_iterator begin, features::const_iterator end, VW::namespace_index index, uint64_t hash)

Check warning on line 345 in utl/flatbuffer/vw_to_flat.cc

View workflow job for this annotation

GitHub Actions / asan.macos-latest.vcpkg-ubsan-debug

'features' is deprecated: Moved into VW namespace. Will be removed in VW 10. [-Wdeprecated-declarations]

Check warning on line 345 in utl/flatbuffer/vw_to_flat.cc

View workflow job for this annotation

GitHub Actions / asan.macos-latest.vcpkg-ubsan-debug

'features' is deprecated: Moved into VW namespace. Will be removed in VW 10. [-Wdeprecated-declarations]

Check warning on line 345 in utl/flatbuffer/vw_to_flat.cc

View workflow job for this annotation

GitHub Actions / asan.macos-latest.vcpkg-asan-debug

'features' is deprecated: Moved into VW namespace. Will be removed in VW 10. [-Wdeprecated-declarations]

Check warning on line 345 in utl/flatbuffer/vw_to_flat.cc

View workflow job for this annotation

GitHub Actions / asan.macos-latest.vcpkg-asan-debug

'features' is deprecated: Moved into VW namespace. Will be removed in VW 10. [-Wdeprecated-declarations]
{
std::stringstream ss;
ss << index;

for (auto it = begin; it != end; ++it) { ss << it.index() << it.value(); }
ss << ":" << hash;

std::string s = ss.str();
uint64_t refid = VW::uniform_hash(s.c_str(), s.size(), 0);
const auto find_ns_offset = _share_examples.find(refid);

if (find_ns_offset == _share_examples.end())
{
flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> namespace_offset;
std::vector<float> feature_values;
std::vector<uint64_t> feature_hashes;

for (auto it = begin; it != end; ++it)
{
for (auto it = begin; it != end; ++it)
if (it.value() != 0) // store the feature data only if the value is non zero
{
fts.push_back(VW::parsers::flatbuffer::CreateFeatureDirect(_builder, nullptr, it.value(), it.index()));
(feature_values).push_back(it.value());
(feature_hashes).push_back(it.index());
}
namespace_offset = VW::parsers::flatbuffer::CreateNamespaceDirect(_builder, nullptr, index, &fts, hash);
}
namespace_offset = VW::parsers::flatbuffer::CreateNamespaceDirect(
_builder, nullptr, index, hash, nullptr, &feature_values, &feature_hashes);

_share_examples[refid] = namespace_offset;
}

Expand Down Expand Up @@ -438,13 +473,25 @@ void to_flat::convert_txt_to_flat(VW::workspace& all)
VW::details::flatten_namespace_extents(ae->feature_space[ns].namespace_extents, ae->feature_space[ns].size());
auto unflattened_with_ranges_that_dont_have_extents = unflatten_namespace_extents_dont_skip(flattened_extents);

for (const auto& extent : unflattened_with_ranges_that_dont_have_extents)
if (all.output_config.audit || all.output_config.hash_inv)
{
for (const auto& extent : unflattened_with_ranges_that_dont_have_extents)
{
// The extent hash for a non-hash-extent will be 0, which is the same as the field not existing to flatbuffers.
auto created_ns = create_namespace_audit(ae->feature_space[ns].audit_begin() + extent.begin_index,
ae->feature_space[ns].audit_begin() + extent.end_index, ns, extent.hash);
namespaces.push_back(created_ns);
}
}
else
{
// The extent hash for a non-hash-extent will be 0, which is the same as the field not existing to flatbuffers.
auto created_ns = create_namespace(ae->feature_space[ns].audit_begin() + extent.begin_index,
ae->feature_space[ns].audit_begin() + extent.end_index, ns, extent.hash,
all.output_config.audit || all.output_config.hash_inv);
namespaces.push_back(created_ns);
for (const auto& extent : unflattened_with_ranges_that_dont_have_extents)
{
// The extent hash for a non-hash-extent will be 0, which is the same as the field not existing to flatbuffers.
auto created_ns = create_namespace(ae->feature_space[ns].cbegin() + extent.begin_index,
ae->feature_space[ns].cbegin() + extent.end_index, ns, extent.hash);
namespaces.push_back(created_ns);
}
}
}
std::string tag(ae->tag.begin(), ae->tag.size());
Expand Down
6 changes: 4 additions & 2 deletions utl/flatbuffer/vw_to_flat.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ class to_flat
void write_to_file(bool collection, bool is_multiline, MultiExampleBuilder& multi_ex_builder,
ExampleBuilder& ex_builder, std::ofstream& outfile);

flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> create_namespace(VW::features::audit_iterator begin,
VW::features::audit_iterator end, VW::namespace_index index, uint64_t hash, bool audit);
/// Builds the flatbuffer Namespace for the features in [begin, end) when audit output is off.
/// No namespace name or feature names are written; only the value and index of features whose
/// value is non-zero are stored. The result is cached in _share_examples under a hash of the
/// namespace index, each feature's index and value, and the extent hash.
/// @param begin first feature of the extent
/// @param end one past the last feature of the extent
/// @param index namespace index passed through to the flatbuffer Namespace
/// @param hash extent hash passed through to the flatbuffer Namespace
flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> create_namespace(
VW::features::const_iterator begin, VW::features::const_iterator end, VW::namespace_index index, uint64_t hash);
/// Builds the flatbuffer Namespace for the features in [begin, end) when audit or hash_inv
/// output is enabled. In addition to each feature's value and index, the namespace name and
/// every feature name taken from the audit data are written. The created offset is cached in
/// _share_examples and returned from there.
/// @param begin first audited feature of the extent
/// @param end one past the last audited feature of the extent
/// @param index namespace index passed through to the flatbuffer Namespace
/// @param hash extent hash passed through to the flatbuffer Namespace
flatbuffers::Offset<VW::parsers::flatbuffer::Namespace> create_namespace_audit(
VW::features::audit_iterator begin, VW::features::audit_iterator end, VW::namespace_index index, uint64_t hash);
};
Loading

0 comments on commit e52ca46

Please sign in to comment.