1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "dictionary/utils/forgetting_curve_utils.h"
18 
19 #include <algorithm>
20 #include <cmath>
21 #include <stdlib.h>
22 
23 #include "dictionary/header/header_policy.h"
24 #include "dictionary/utils/probability_utils.h"
25 #include "utils/time_keeper.h"
26 
27 namespace latinime {
28 
29 const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8;
30 const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
31 
32 const int ForgettingCurveUtils::MAX_LEVEL = 15;
33 const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 2;
34 const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 31;
35 const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 30;
36 const int ForgettingCurveUtils::OCCURRENCES_TO_RAISE_THE_LEVEL = 1;
37 // TODO: Evaluate whether this should be 7.5 days.
38 // 15 days
39 const int ForgettingCurveUtils::DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS = 15 * 24 * 60 * 60;
40 
41 const float ForgettingCurveUtils::ENTRY_COUNT_HARD_LIMIT_WEIGHT = 1.2;
42 
43 const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable;
44 
45 // TODO: Revise the logic to decide the initial probability depending on the given probability.
createUpdatedHistoricalInfo(const HistoricalInfo * const originalHistoricalInfo,const int newProbability,const HistoricalInfo * const newHistoricalInfo,const HeaderPolicy * const headerPolicy)46 /* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo(
47         const HistoricalInfo *const originalHistoricalInfo, const int newProbability,
48         const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy) {
49     const int timestamp = newHistoricalInfo->getTimestamp();
50     if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) {
51         // Add entry as a valid word.
52         const int level = clampToVisibleEntryLevelRange(newHistoricalInfo->getLevel());
53         const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy);
54         return HistoricalInfo(timestamp, level, count);
55     } else if (!originalHistoricalInfo->isValid()
56             || originalHistoricalInfo->getLevel() < newHistoricalInfo->getLevel()
57             || (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel()
58                     && originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) {
59         // Initial information.
60         int count = newHistoricalInfo->getCount();
61         if (count >= OCCURRENCES_TO_RAISE_THE_LEVEL) {
62             const int level = clampToValidLevelRange(newHistoricalInfo->getLevel() + 1);
63             return HistoricalInfo(timestamp, level, 0 /* count */);
64         }
65         const int level = clampToValidLevelRange(newHistoricalInfo->getLevel());
66         return HistoricalInfo(timestamp, level, clampToValidCountRange(count, headerPolicy));
67     } else {
68         const int updatedCount = originalHistoricalInfo->getCount() + 1;
69         if (updatedCount >= OCCURRENCES_TO_RAISE_THE_LEVEL) {
70             // The count exceeds the max value the level can be incremented.
71             if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) {
72                 // The level is already max.
73                 return HistoricalInfo(timestamp,
74                         originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount());
75             } else {
76                 // Raise the level.
77                 return HistoricalInfo(timestamp,
78                         originalHistoricalInfo->getLevel() + 1, 0 /* count */);
79             }
80         } else {
81             return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), updatedCount);
82         }
83     }
84 }
85 
decodeProbability(const HistoricalInfo * const historicalInfo,const HeaderPolicy * const headerPolicy)86 /* static */ int ForgettingCurveUtils::decodeProbability(
87         const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) {
88     const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimestamp(),
89             DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS);
90     return sProbabilityTable.getProbability(
91             headerPolicy->getForgettingCurveProbabilityValuesTableId(),
92             clampToValidLevelRange(historicalInfo->getLevel()),
93             clampToValidTimeStepCountRange(elapsedTimeStepCount));
94 }
95 
needsToKeep(const HistoricalInfo * const historicalInfo,const HeaderPolicy * const headerPolicy)96 /* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo,
97         const HeaderPolicy *const headerPolicy) {
98     return historicalInfo->getLevel() > 0
99             || getElapsedTimeStepCount(historicalInfo->getTimestamp(),
100                     DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS)
101                             < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
102 }
103 
createHistoricalInfoToSave(const HistoricalInfo * const originalHistoricalInfo,const HeaderPolicy * const headerPolicy)104 /* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave(
105         const HistoricalInfo *const originalHistoricalInfo,
106         const HeaderPolicy *const headerPolicy) {
107     if (originalHistoricalInfo->getTimestamp() == NOT_A_TIMESTAMP) {
108         return HistoricalInfo();
109     }
110     const int durationToLevelDownInSeconds = DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS;
111     const int elapsedTimeStep = getElapsedTimeStepCount(
112             originalHistoricalInfo->getTimestamp(), durationToLevelDownInSeconds);
113     if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) {
114         // No need to update historical info.
115         return *originalHistoricalInfo;
116     }
117     // Lower the level.
118     const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
119     const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ?
120             originalHistoricalInfo->getLevel() : maxLevelDownAmonut;
121     const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimestamp() +
122             levelDownAmount * durationToLevelDownInSeconds;
123     return HistoricalInfo(adjustedTimestampInSeconds,
124             originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */);
125 }
126 
needsToDecay(const bool mindsBlockByDecay,const EntryCounts & entryCounts,const HeaderPolicy * const headerPolicy)127 /* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay,
128         const EntryCounts &entryCounts, const HeaderPolicy *const headerPolicy) {
129     const EntryCounts &maxNgramCounts = headerPolicy->getMaxNgramCounts();
130     for (const auto ngramType : AllNgramTypes::ASCENDING) {
131         if (entryCounts.getNgramCount(ngramType)
132                 >= getEntryCountHardLimit(maxNgramCounts.getNgramCount(ngramType))) {
133             // Unigram count exceeds the limit.
134             return true;
135         }
136     }
137     if (mindsBlockByDecay) {
138         return false;
139     }
140     if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS
141             < TimeKeeper::peekCurrentTime()) {
142         // Time to decay.
143         return true;
144     }
145     return false;
146 }
147 
148 // See comments in ProbabilityUtils::backoff().
backoff(const int unigramProbability)149 /* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) {
150     // See TODO comments in ForgettingCurveUtils::getProbability().
151     return unigramProbability;
152 }
153 
getElapsedTimeStepCount(const int timestamp,const int durationToLevelDownInSeconds)154 /* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp,
155         const int durationToLevelDownInSeconds) {
156     const int elapsedTimeInSeconds = TimeKeeper::peekCurrentTime() - timestamp;
157     const int timeStepDurationInSeconds =
158             durationToLevelDownInSeconds / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
159     return elapsedTimeInSeconds / timeStepDurationInSeconds;
160 }
161 
clampToVisibleEntryLevelRange(const int level)162 /* static */ int ForgettingCurveUtils::clampToVisibleEntryLevelRange(const int level) {
163     return std::min(std::max(level, MIN_VISIBLE_LEVEL), MAX_LEVEL);
164 }
165 
clampToValidCountRange(const int count,const HeaderPolicy * const headerPolicy)166 /* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count,
167         const HeaderPolicy *const headerPolicy) {
168     return std::min(std::max(count, 0), OCCURRENCES_TO_RAISE_THE_LEVEL - 1);
169 }
170 
clampToValidLevelRange(const int level)171 /* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) {
172     return std::min(std::max(level, 0), MAX_LEVEL);
173 }
174 
clampToValidTimeStepCountRange(const int timeStepCount)175 /* static */ int ForgettingCurveUtils::clampToValidTimeStepCountRange(const int timeStepCount) {
176     return std::min(std::max(timeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT);
177 }
178 
179 const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4;
180 const int ForgettingCurveUtils::ProbabilityTable::WEAK_PROBABILITY_TABLE_ID = 0;
181 const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = 1;
182 const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2;
183 const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3;
184 const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127;
185 const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 8;
186 const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 9;
187 const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 10;
188 
189 
ProbabilityTable()190 ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() {
191     mTables.resize(PROBABILITY_TABLE_COUNT);
192     for (int tableId = 0; tableId < PROBABILITY_TABLE_COUNT; ++tableId) {
193         mTables[tableId].resize(MAX_LEVEL + 1);
194         for (int level = 0; level <= MAX_LEVEL; ++level) {
195             mTables[tableId][level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1);
196             const float initialProbability = getBaseProbabilityForLevel(tableId, level);
197             const float endProbability = getBaseProbabilityForLevel(tableId, level - 1);
198             for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT;
199                     ++timeStepCount) {
200                 if (level < MIN_VISIBLE_LEVEL) {
201                     mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY;
202                     continue;
203                 }
204                 const float probability = initialProbability
205                         * powf(initialProbability / endProbability,
206                                 -1.0f * static_cast<float>(timeStepCount)
207                                         / static_cast<float>(MAX_ELAPSED_TIME_STEP_COUNT + 1));
208                 mTables[tableId][level][timeStepCount] =
209                         std::min(std::max(static_cast<int>(probability), 1), MAX_PROBABILITY);
210             }
211         }
212     }
213 }
214 
getBaseProbabilityForLevel(const int tableId,const int level)215 /* static */ int ForgettingCurveUtils::ProbabilityTable::getBaseProbabilityForLevel(
216         const int tableId, const int level) {
217     if (tableId == WEAK_PROBABILITY_TABLE_ID) {
218         // Max probability is 127.
219         return static_cast<float>(WEAK_MAX_PROBABILITY / (1 << (MAX_LEVEL - level)));
220     } else if (tableId == MODEST_PROBABILITY_TABLE_ID) {
221         // Max probability is 128.
222         return static_cast<float>(MODEST_BASE_PROBABILITY * (level + 1));
223     } else if (tableId == STRONG_PROBABILITY_TABLE_ID) {
224         // Max probability is 140.
225         return static_cast<float>(STRONG_BASE_PROBABILITY * (level + 1));
226     } else if (tableId == AGGRESSIVE_PROBABILITY_TABLE_ID) {
227         // Max probability is 160.
228         return static_cast<float>(AGGRESSIVE_BASE_PROBABILITY * (level + 1));
229     } else {
230         return NOT_A_PROBABILITY;
231     }
232 }
233 
234 } // namespace latinime
235