Skip to content

Commit

Permalink
algorithm change
Browse files Browse the repository at this point in the history
  • Loading branch information
4kimov committed Sep 10, 2023
1 parent 59ad779 commit ca7b6bf
Show file tree
Hide file tree
Showing 8 changed files with 172 additions and 141 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# CHANGELOG

**v0.2.0:** **⚠️ BREAKING CHANGE**
- **Breaking change**: IDs change. Algorithm has been fine-tuned for better performance [[Issue #11](https://github.com/sqids/sqids-spec/issues/11)]
- `alphabet` cannot contain multibyte characters
- `min_length` upper limit has increased from alphabet length to `255`
- The maximum number of blocklist re-encoding attempts has been capped at the alphabet length minus 1
- Minimum alphabet length has changed from 5 to 3
- `min_value()` and `max_value()` functions have been removed

**v0.1.2:**
- Bug fix: spec update (PR #7): blocklist filtering in uppercase-only alphabet [[PR #7](https://github.com/sqids/sqids-spec/pull/7)]
- Lower uniques test from 1_000_000 to 10_000
Expand Down
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,34 +55,34 @@ Simple encode & decode:

```ruby
sqids = Sqids.new
id = sqids.encode([1, 2, 3]) # '8QRLaD'
id = sqids.encode([1, 2, 3]) # '86Rf07'
numbers = sqids.decode(id) # [1, 2, 3]
```

> **Note**
> 🚧 Because of the algorithm's design, **multiple IDs can decode back into the same sequence of numbers**. If it's important to your design that IDs are canonical, you have to manually re-encode decoded numbers and check that the generated ID matches.
Randomize IDs by providing a custom alphabet:
Enforce a *minimum* length for IDs:

```ruby
sqids = Sqids.new(alphabet: 'FxnXM1kBN6cuhsAvjW3Co7l2RePyY8DwaU04Tzt9fHQrqSVKdpimLGIJOgb5ZE')
id = sqids.encode([1, 2, 3]) # 'B5aMa3'
sqids = Sqids.new(min_length: 10)
id = sqids.encode([1, 2, 3]) # '86Rf07xd4z'
numbers = sqids.decode(id) # [1, 2, 3]
```

Enforce a *minimum* length for IDs:
Randomize IDs by providing a custom alphabet:

```ruby
sqids = Sqids.new(min_length: 10)
id = sqids.encode([1, 2, 3]) # '75JT1cd0dL'
sqids = Sqids.new(alphabet: 'FxnXM1kBN6cuhsAvjW3Co7l2RePyY8DwaU04Tzt9fHQrqSVKdpimLGIJOgb5ZE')
id = sqids.encode([1, 2, 3]) # 'B4aajs'
numbers = sqids.decode(id) # [1, 2, 3]
```

Prevent specific words from appearing anywhere in the auto-generated IDs:

```ruby
sqids = Sqids.new(blocklist: Set.new(%w[word1 word2]))
id = sqids.encode([1, 2, 3]) # '8QRLaD'
sqids = Sqids.new(blocklist: Set.new(%w[86Rf07]))
id = sqids.encode([1, 2, 3]) # 'se8ojk'
numbers = sqids.decode(id) # [1, 2, 3]
```

Expand Down
93 changes: 37 additions & 56 deletions lib/sqids.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,18 @@ def initialize(options = {})
min_length = options[:min_length] || DEFAULT_MIN_LENGTH
blocklist = options[:blocklist] || DEFAULT_BLOCKLIST

raise ArgumentError, 'Alphabet length must be at least 5' if alphabet.length < 5
raise ArgumentError, 'Alphabet cannot contain multibyte characters' if contains_multibyte_chars(alphabet)
raise ArgumentError, 'Alphabet length must be at least 3' if alphabet.length < 3

if alphabet.chars.uniq.size != alphabet.length
raise ArgumentError,
'Alphabet must contain unique characters'
end

unless min_length.is_a?(Integer) && min_length >= Sqids.min_value && min_length <= alphabet.length
min_length_limit = 255
unless min_length.is_a?(Integer) && min_length >= 0 && min_length <= min_length_limit
raise TypeError,
"Minimum length has to be between #{Sqids.min_value} and #{alphabet.length}"
"Minimum length has to be between 0 and #{min_length_limit}"
end

filtered_blocklist = blocklist.select do |word|
Expand All @@ -39,13 +41,13 @@ def initialize(options = {})
def encode(numbers)
return '' if numbers.empty?

in_range_numbers = numbers.select { |n| n >= Sqids.min_value && n <= Sqids.max_value }
in_range_numbers = numbers.select { |n| n >= 0 && n <= Sqids.max_value }
unless in_range_numbers.length == numbers.length
raise ArgumentError,
"Encoding supports numbers between #{Sqids.min_value} and #{Sqids.max_value}"
"Encoding supports numbers between 0 and #{Sqids.max_value}"
end

encode_numbers(in_range_numbers, partitioned: false)
encode_numbers(in_range_numbers)
end

def decode(id)
Expand All @@ -61,26 +63,18 @@ def decode(id)
prefix = id[0]
offset = @alphabet.index(prefix)
alphabet = @alphabet.slice(offset, @alphabet.length) + @alphabet.slice(0, offset)
partition = alphabet[1]
alphabet = alphabet.slice(2, alphabet.length)
alphabet = alphabet.reverse

id = id[1, id.length]

partition_index = id.index(partition)
if !partition_index.nil? && partition_index.positive? && partition_index < id.length - 1
id = id[partition_index + 1, id.length]
alphabet = shuffle(alphabet)
end

while id.length.positive?
separator = alphabet[-1]
chunks = id.split(separator, 2)
separator = alphabet[0]

chunks = id.split(separator, 2)
if chunks.any?
alphabet_without_separator = alphabet.slice(0, alphabet.length - 1)
return [] unless chunks[0].chars.all? { |c| alphabet_without_separator.include?(c) }
return ret if chunks[0] == ''

ret.push(to_number(chunks[0], alphabet_without_separator))
ret.push(to_number(chunks[0], alphabet.slice(1, alphabet.length - 1)))
alphabet = shuffle(alphabet) if chunks.length > 1
end

Expand All @@ -90,14 +84,6 @@ def decode(id)
ret
end

def self.min_value
0
end

def self.max_value
defined?(Integer::MAX) ? Integer::MAX : ((2**((0.size * 8) - 2)) - 1)
end

private

def shuffle(alphabet)
Expand All @@ -115,59 +101,42 @@ def shuffle(alphabet)
chars.join
end

def encode_numbers(numbers, partitioned: false)
def encode_numbers(numbers, increment: 0)
raise ArgumentError, 'Reached max attempts to re-generate the ID' if increment > @alphabet.length

offset = numbers.length
numbers.each_with_index do |v, i|
offset += @alphabet[v % @alphabet.length].ord + i
end
offset = offset % @alphabet.length
offset = (offset + increment) % @alphabet.length

alphabet = @alphabet.slice(offset, @alphabet.length) + @alphabet.slice(0, offset)
prefix = alphabet[0]
partition = alphabet[1]
alphabet = alphabet.slice(2, alphabet.length)
alphabet = alphabet.reverse
ret = [prefix]

numbers.each_with_index do |num, i|
alphabet_without_separator = alphabet.slice(0, alphabet.length - 1)
ret.push(to_id(num, alphabet_without_separator))
ret.push(to_id(num, alphabet.slice(1, alphabet.length - 1)))

next unless i < numbers.length - 1

separator = alphabet[-1]
if partitioned && i.zero?
ret.push(partition)
else
ret.push(separator)
end

ret.push(alphabet[0])
alphabet = shuffle(alphabet)
end

id = ret.join

if @min_length > id.length
unless partitioned
numbers = [0] + numbers
id = encode_numbers(numbers, partitioned: true)
end
id += alphabet[0]

if @min_length > id.length
id = id[0] + alphabet[0,
@min_length - id.length] + id[1,
id.length - 1]
while (@min_length - id.length).positive?
alphabet = shuffle(alphabet)
id += alphabet.slice(0, [@min_length - id.length, alphabet.length].min)
end
end

if blocked_id?(id)
if partitioned
numbers[0] += 1
else
numbers = [0] + numbers
end

id = encode_numbers(numbers, partitioned: true)
end
id = encode_numbers(numbers, increment: increment + 1) if blocked_id?(id)

id
end
Expand Down Expand Up @@ -206,4 +175,16 @@ def blocked_id?(id)
end
end
end

# Reports whether any character of +input_str+ occupies more than one byte.
#
# @param input_str [String] text to inspect
# @return [Boolean] true if at least one character has a bytesize above 1,
#   false otherwise (including for an empty string)
def contains_multibyte_chars(input_str)
  input_str.each_char.any? { |ch| ch.bytesize > 1 }
end

# Upper bound used when validating numbers to encode.
#
# Returns Integer::MAX when that constant is defined; otherwise derives
# 2**(bits - 2) - 1, where bits is eight times the byte size reported by
# +0.size+.
#
# @return [Integer]
def self.max_value
  return Integer::MAX if defined?(Integer::MAX)

  bit_width = 0.size * 8
  (2**(bit_width - 2)) - 1
end
end
12 changes: 9 additions & 3 deletions spec/alphabet_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
sqids = Sqids.new(alphabet: '0123456789abcdef')

numbers = [1, 2, 3]
id = '4d9fd2'
id = '489158'

expect(sqids.encode(numbers)).to eq(id)
expect(sqids.decode(id)).to eq(numbers)
end

it 'decodes after encoding with a short alphabet' do
sqids = Sqids.new(alphabet: 'abcde')
sqids = Sqids.new(alphabet: 'abc')

numbers = [1, 2, 3]
encoded = sqids.encode(numbers)
Expand All @@ -33,6 +33,12 @@
expect(sqids.decode(encoded)).to eq(numbers)
end

# Constructing Sqids with an alphabet that includes a multibyte character
# ('ë' here) is expected to raise ArgumentError.
it 'fails when alphabet has multibyte characters' do
expect do
Sqids.new(alphabet: 'ë1092')
end.to raise_error(ArgumentError)
end

it 'fails when alphabet characters are repeated' do
expect do
Sqids.new(alphabet: 'aabcdefg')
Expand All @@ -41,7 +47,7 @@

it 'fails when alphabet is too short' do
expect do
Sqids.new(alphabet: 'abcd')
Sqids.new(alphabet: 'ab')
end.to raise_error(ArgumentError)
end
end
59 changes: 37 additions & 22 deletions spec/blocklist_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,59 +8,74 @@
it 'uses default blocklist if no custom blocklist is provided' do
sqids = Sqids.new

expect(sqids.decode('sexy')).to eq([200_044])
expect(sqids.encode([200_044])).to eq('d171vI')
expect(sqids.decode('aho1e')).to eq([4_572_721])
expect(sqids.encode([4_572_721])).to eq('JExTR')
end

it 'does not use any blocklist if an empty blocklist is provided' do
sqids = Sqids.new(blocklist: Set.new([]))

expect(sqids.decode('sexy')).to eq([200_044])
expect(sqids.encode([200_044])).to eq('sexy')
expect(sqids.decode('aho1e')).to eq([4_572_721])
expect(sqids.encode([4_572_721])).to eq('aho1e')
end

it 'uses provided blocklist if non-empty blocklist is provided' do
sqids = Sqids.new(blocklist: Set.new(['AvTg']))
sqids = Sqids.new(blocklist: Set.new(['ArUO']))

expect(sqids.decode('sexy')).to eq([200_044])
expect(sqids.encode([200_044])).to eq('sexy')
expect(sqids.decode('aho1e')).to eq([4_572_721])
expect(sqids.encode([4_572_721])).to eq('aho1e')

expect(sqids.decode('AvTg')).to eq([100_000])
expect(sqids.encode([100_000])).to eq('7T1X8k')
expect(sqids.decode('7T1X8k')).to eq([100_000])
expect(sqids.decode('ArUO')).to eq([100_000])
expect(sqids.encode([100_000])).to eq('QyG4')
expect(sqids.decode('QyG4')).to eq([100_000])
end

it 'uses blocklist to prevent certain encodings' do
sqids = Sqids.new(blocklist: Set.new(%w[8QRLaD 7T1cd0dL UeIe imhw LfUQ]))
sqids = Sqids.new(blocklist: Set.new(%w[JSwXFaosAN OCjV9JK64o rBHf 79SM 7tE6]))

expect(sqids.encode([1, 2, 3])).to eq('TM0x1Mxz')
expect(sqids.decode('TM0x1Mxz')).to eq([1, 2, 3])
expect(sqids.encode([1_000_000, 2_000_000])).to eq('1aYeB7bRUt')
expect(sqids.decode('1aYeB7bRUt')).to eq([1_000_000, 2_000_000])
end

it 'can decode blocklist words' do
sqids = Sqids.new(blocklist: Set.new(%w[8QRLaD 7T1cd0dL RA8UeIe7 WM3Limhw LfUQh4HN]))
sqids = Sqids.new(blocklist: Set.new(%w[86Rf07 se8ojk ARsz1p Q8AI49 5sQRZO]))

expect(sqids.decode('8QRLaD')).to eq([1, 2, 3])
expect(sqids.decode('7T1cd0dL')).to eq([1, 2, 3])
expect(sqids.decode('RA8UeIe7')).to eq([1, 2, 3])
expect(sqids.decode('WM3Limhw')).to eq([1, 2, 3])
expect(sqids.decode('LfUQh4HN')).to eq([1, 2, 3])
expect(sqids.decode('86Rf07')).to eq([1, 2, 3])
expect(sqids.decode('se8ojk')).to eq([1, 2, 3])
expect(sqids.decode('ARsz1p')).to eq([1, 2, 3])
expect(sqids.decode('Q8AI49')).to eq([1, 2, 3])
expect(sqids.decode('5sQRZO')).to eq([1, 2, 3])
end

it 'matches against a short blocklist word' do
sqids = Sqids.new(blocklist: Set.new(['pPQ']))
sqids = Sqids.new(blocklist: Set.new(['pnd']))

expect(sqids.decode(sqids.encode([1_000]))).to eq([1_000])
end

it 'blocklist filtering in constructor' do
# lowercase blocklist in only-uppercase alphabet
sqids = Sqids.new(alphabet: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', blocklist: Set.new(['sqnmpn']))
sqids = Sqids.new(alphabet: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', blocklist: Set.new(['sxnzkl']))

id = sqids.encode([1, 2, 3])
numbers = sqids.decode(id)

expect(id).to eq('ULPBZGBM') # without blocklist, would've been "SQNMPN"
expect(id).to eq('IBSHOZ') # without blocklist, would've been "SXNZKL"
expect(numbers).to eq([1, 2, 3])
end

# Encoding is expected to give up with ArgumentError when re-generation
# attempts are exhausted.
it 'max encoding attempts' do
alphabet = 'abc'
min_length = 3
blocklist = Set.new(%w[cab abc bca])

sqids = Sqids.new(alphabet: alphabet, min_length: min_length, blocklist: blocklist)

# Sanity checks on the fixture itself: min_length, the alphabet length and
# the number of blocked words are all equal (3).
expect(min_length).to eq(alphabet.length)
expect(min_length).to eq(blocklist.size)

# NOTE(review): presumably every candidate ID produced for [0] with this
# alphabet and min_length is one of the blocked words, so no attempt can
# succeed — confirm against the encoder.
expect do
sqids.encode([0])
end.to raise_error(ArgumentError)
end
end
Loading

0 comments on commit ca7b6bf

Please sign in to comment.