LLM for Unity  v2.4.2
Create characters in Unity with LLMs!
Loading...
Searching...
No Matches
TokenSplitter.cs
Go to the documentation of this file.
1
3using System;
4using System.Collections.Generic;
5using System.Threading.Tasks;
6using UnityEngine;
7
namespace LLMUnity
{
    /// <summary>
    /// Chunking implementation that splits a phrase into chunks of a fixed number of tokens.
    /// </summary>
    [Serializable]
    public class TokenSplitter : Chunking
    {
        /// <summary> number of tokens by which to split phrases into chunks </summary>
        [Tooltip("number of tokens by which to split phrases into chunks")]
        public int numTokens = 10;

        /// <summary>
        /// Estimates the (inclusive) index in <paramref name="input"/> at which the chunk
        /// represented by <paramref name="detokenised"/> ends.
        /// The initial estimate is <c>startIndex + detokenised.Length - 1</c> (capped at the last
        /// input character). Because the detokenised text does not have to match the input
        /// character-for-character, the estimate is then refined by looking near it for one of
        /// the trailing characters of <paramref name="detokenised"/>.
        /// </summary>
        /// <param name="input">full phrase being split</param>
        /// <param name="detokenised">text obtained by detokenising the current batch of tokens</param>
        /// <param name="startIndex">index in <paramref name="input"/> where the current chunk starts</param>
        /// <param name="searchRange">number of offsets (0 .. searchRange-1) probed on each side of the estimate</param>
        /// <param name="charsFromEnd">how many trailing characters of <paramref name="detokenised"/> are tried, last one first</param>
        /// <returns>
        /// Index of the first matching character found, otherwise the initial estimate.
        /// When <paramref name="detokenised"/> is empty the estimate is <c>startIndex - 1</c>.
        /// </returns>
        protected int DetermineEndIndex(string input, string detokenised, int startIndex, int searchRange = 5, int charsFromEnd = 3)
        {
            int lastInputIndex = input.Length - 1;
            int endIndex = Math.Min(lastInputIndex, startIndex + detokenised.Length - 1);
            if (endIndex == lastInputIndex) return endIndex;

            // Never look before the start of the current chunk: a match there would produce
            // an inverted range and make the next chunk overlap the previous one.
            int lowerBound = Math.Max(0, startIndex);

            for (int lastCharI = 0; lastCharI < charsFromEnd; lastCharI++)
            {
                int charI = detokenised.Length - 1 - lastCharI;
                if (charI < 0) break;
                char lastChar = detokenised[charI];

                // Probe outwards from the estimate, checking the position before it first.
                for (int offset = 0; offset < searchRange; offset++)
                {
                    int before = endIndex - offset;
                    if (before >= lowerBound && before <= lastInputIndex && input[before] == lastChar) return before;

                    int after = endIndex + offset;
                    if (after >= lowerBound && after <= lastInputIndex && input[after] == lastChar) return after;
                }
            }
            return endIndex;
        }

        /// <summary>
        /// Splits the provided phrase into chunks of a specific number of tokens
        /// (defined by the numTokens variable).
        /// </summary>
        /// <param name="input">phrase to split</param>
        /// <returns>
        /// List of (start, end) character indices into <paramref name="input"/>, one pair per chunk;
        /// both indices are inclusive. The list is empty when the input is empty or yields no tokens.
        /// </returns>
        public override async Task<List<(int, int)>> Split(string input)
        {
            List<(int, int)> indices = new List<(int, int)>();
            List<int> tokens = await search.Tokenize(input);
            if (tokens.Count == 0 || string.IsNullOrEmpty(input)) return indices;

            // numTokens is a public, inspector-editable field: a value <= 0 would otherwise
            // stall the loop (0) or request a negative range (negative).
            int step = Math.Max(1, numTokens);
            int lastInputIndex = input.Length - 1;

            int startIndex = 0;
            for (int i = 0; i < tokens.Count; i += step)
            {
                int batchTokens = Math.Min(tokens.Count, i + step) - i;
                string detokenised = await search.Detokenize(tokens.GetRange(i, batchTokens));

                // A batch that detokenises to nothing covers no characters of the input.
                if (string.IsNullOrEmpty(detokenised)) continue;

                int endIndex = DetermineEndIndex(input, detokenised, startIndex);
                indices.Add((startIndex, endIndex));
                startIndex = endIndex + 1;
                if (endIndex == lastInputIndex) break;
            }

            // Whatever is left after the last batch becomes a final chunk.
            if (startIndex <= lastInputIndex) indices.Add((startIndex, lastInputIndex));
            return indices;
        }
    }
}
Class implementing the chunking functionality.
Definition Chunking.cs:18
Class implementing a token-based splitter.
int numTokens
number of tokens by which to split phrases into chunks
override async Task< List<(int, int)> > Split(string input)
Splits the provided phrase into chunks of a specific number of tokens (defined by the numTokens variable).