-
Notifications
You must be signed in to change notification settings - Fork 28
/
generation.sql
240 lines (212 loc) · 10.9 KB
/
generation.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
/**
Copyright 2024 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
CREATE OR REPLACE FUNCTION `[DATASET]`.TitlesPrompt(
LANGUAGE STRING,
EXAMPLES ARRAY<STRUCT<id STRING, properties STRING, title STRING, description STRING>>,
PROPERTIES ARRAY<STRING>) AS (
CONCAT(
"""You are a leading digital marketer working for a top retail organisation.
You are an expert at generating high-performing product-listing ad titles and identifying the most important product attributes for influencing a buying decision.
Given the input product data below, for each described product generate a title in """,
LANGUAGE,
""". Adhere to the following rules:
1) Put each title on a separate output line, in the same order as the input, and prepended with the product's ID.
2) Don't make this a numbered list or a list with dashes: each title must consist of pure text, without any formatting characters.
3) Do not prepend your output with a headline announcing what's following.
4) Each title must list product attributes, should not exceed 20 words warranted by the product data, among them no duplicates.
5) If there is a named size attribute, prefix its value with the word for "Size" in the requested language and replace long identifiers with their usual abbreviations. (E.g. for English, this means Small, Medium, Large and X-Large are to be replaced by S, M, L and XL, respectively.)
6) Product attributes should be enumerated with commas, as seen in the examples, but not vertical bars, dashes or parentheses.
7) Write dimension without spaces, i.e. do not use "10 x 5 cm", but instead "10×5cm".
Let's first look at some examples of how to write good titles:""",
"\n\nExample input product data:\n\n", ARRAY_TO_STRING(
(SELECT ARRAY_AGG(properties) FROM UNNEST(EXAMPLES)), '\n', ''),
"\n\nExample output product titles (adhering to all seven rules):\n\n", ARRAY_TO_STRING(
(SELECT ARRAY_AGG(CONCAT(id, ': ', title)) FROM UNNEST(EXAMPLES)), '\n', ''),
"""\n\nBefore getting to the actual task at hand, let's remember the rules by looking at some bad examples for titles and how they would be corrected:
- "ABC's Hamburger; great hamburger for evenings; with 200g meat, ketchup & salad" – this violates rule 3, as it has duplication and makes claims that are not objective attributes, and rule 6, as it uses semicolons instead of commas. Better: "ABC's Hamburger, with 200g meat, ketchup & salad"
- "Company dishwasher DW45, 50 x 50 x 70 cm, (1231254)" – this violates rule 4, as it mentions a useless ID, and rule 7, as it uses spaces inside the dimensions. Better: "Company dishwasher DW45, 50×50×70cm"
- "Fast runners' shoes, Xtra-large, beige, vegan leather" – this violates rule 5, as the named size attribute is not prefixed, nor abbreviated. Better: "Fast runners' shoes, Size XL, beige, vegan leather"
- "Woodpecker – Night-time Bed, 210 x 100cm, birch, reinforced frame" – this violates rule 6, as it separates the brand from the product name with a dash instead of a comma, and rule 7, as it has spaces between the dimensions. Better: "Woodpecker Night-time Bed, 210×100cm, birch, reinforced frame"
- "Tapy Tape Roll Pink – 500 x 5 cm, 1 St" – this violates rule 6, as it separates the brand from the product name with a dash instead of a comma, and rule 7, as it has spaces between the dimensions. Better: "Tapy Tape Roll, pink, 500×5cm, 1 St"
Now let's tackle the actual task at hand:""",
"\n\nActual input product data:\n\n", ARRAY_TO_STRING(PROPERTIES, '\n', ''),
"\n\nActual output product titles (adhering to all seven rules):\n\n"
)
);
CREATE OR REPLACE FUNCTION `[DATASET]`.DescriptionsPrompt(
LANGUAGE STRING,
EXAMPLES ARRAY<STRUCT<id STRING, properties STRING, title STRING, description STRING>>,
PROPERTIES ARRAY<STRING>) AS (
CONCAT(
"""You are a leading digital marketer working for a top retail organisation.
You are an expert at generating product descriptions that increase the likelihood for the product to be bought, without misleading the customer.
Given the product data below, for each described product generate a description in grammatically correct """,
LANGUAGE,
""", each one on a separate output line, in the same order as the input, and prepended with the product's ID.
Don't make this a numbered list or a list with dashes: each description must consist of pure text, without any formatting characters.
Do not prepend your output with a headline announcing what's following.
The description should be detailed, but should not exceed 100 words, and only contain claims justifiable from the product data.
Before looking at the actual product data to be processed, let's look at some examples:""",
"\n\nExample input product data:\n\n", ARRAY_TO_STRING(
(SELECT ARRAY_AGG(properties) FROM UNNEST(EXAMPLES)), '\n', ''),
"\n\nExample output product descriptions (in the same order as the input, prepended with the respective ID, but without headline, without empty lines, without indentation, without leading dashes):\n\n", ARRAY_TO_STRING(
(SELECT ARRAY_AGG(CONCAT(id, ': ', description)) FROM UNNEST(EXAMPLES)), '\n', ''),
"\n\nNow let's tackle the actual task at hand:",
"\n\nActual input product data:\n\n", ARRAY_TO_STRING(PROPERTIES, '\n', ''),
"\n\nActual output product descriptions (in the same order as the input, prepended with the respective ID, but without headline, without empty lines, without indentation, without leading dashes):\n\n"
)
);
CREATE OR REPLACE PROCEDURE `[DATASET].BatchedUpdateTitles`(
ITEMS_PER_PROMPT INT64,
LANGUAGE STRING,
PARTS INT64,
PART INT64,
IDS ARRAY<STRING>)
OPTIONS(strict_mode=FALSE) -- Don't abort if tables don't yet exist.
BEGIN
DECLARE EXAMPLES ARRAY<STRUCT<id STRING, properties STRING, title STRING, description STRING>> DEFAULT (
SELECT ARRAY_AGG(Examples) FROM `[DATASET]`.Examples
);
LOOP
IF (
SELECT COUNT(*) = 0 AND IDS IS NULL
FROM `[DATASET]`.Output
WHERE title IS NULL AND tries < 3
AND (PARTS IS NULL OR ABS(MOD(FARM_FINGERPRINT(id), PARTS)) = PART)
) THEN LEAVE;
END IF;
-- Generate prompts
CREATE OR REPLACE TEMP TABLE Prompts AS
WITH
Input AS (
SELECT id, TO_JSON_STRING(I) AS properties
FROM `[DATASET]`.Output AS O
INNER JOIN `[DATASET]`.InputProcessing AS I USING (id)
WHERE (PARTS IS NULL OR ABS(MOD(FARM_FINGERPRINT(id), PARTS)) = PART)
AND IF(IDS IS NOT NULL,
O.id IN UNNEST(IDS),
O.title IS NULL AND O.tries < 3)
ORDER BY RAND()
LIMIT 600 -- TODO: Find out how to use a parameter ITEMS_PER_ITERATION here.
),
Numbered AS (
SELECT id, properties, ROW_NUMBER() OVER (ORDER BY id) - 1 AS row_id
FROM Input
)
SELECT
DIV(row_id, ITEMS_PER_PROMPT) AS chunk_id,
`[DATASET]`.TitlesPrompt(LANGUAGE, EXAMPLES, ARRAY_AGG(properties ORDER BY id)) AS prompt,
ARRAY_AGG(id ORDER BY id) AS ids
FROM Numbered
GROUP BY 1;
-- Generate titles
CREATE OR REPLACE TEMP TABLE Generated AS
SELECT ids, COALESCE(SPLIT(ml_generate_text_llm_result, '\n'), ids) AS output,
FROM
ML.GENERATE_TEXT(
MODEL `[DATASET]`.GeminiFlash,
TABLE Prompts,
STRUCT(
0.1 AS temperature,
2048 AS max_output_tokens,
TRUE AS flatten_json_output));
-- Store generated titles in output feed
MERGE `[DATASET]`.Output AS O
USING (
SELECT
COALESCE(REGEXP_EXTRACT(output, r'^([^:]+): .*'), REGEXP_EXTRACT(output, r'^([^:]+)$')) AS id,
REGEXP_EXTRACT(output, r'^[^:]+: (.*)$') AS title
FROM Generated AS G
CROSS JOIN G.output
QUALIFY ROW_NUMBER() OVER (PARTITION BY id) = 1 AND id IN UNNEST(G.ids)
) AS G
ON O.id = G.id
WHEN MATCHED THEN UPDATE SET
O.title = IFNULL(G.title, O.title),
O.tries = O.tries + 1;
IF IDS IS NOT NULL THEN LEAVE;
END IF;
END LOOP;
END;
CREATE OR REPLACE PROCEDURE `[DATASET].BatchedUpdateDescriptions`(
ITEMS_PER_PROMPT INT64,
LANGUAGE STRING,
PARTS INT64,
PART INT64,
IDS ARRAY<STRING>)
OPTIONS(strict_mode=FALSE) -- Don't abort if tables don't yet exist.
BEGIN
DECLARE EXAMPLES ARRAY<STRUCT<id STRING, properties STRING, title STRING, description STRING>> DEFAULT (
SELECT ARRAY_AGG(Examples) FROM `[DATASET]`.Examples
);
LOOP
IF (
SELECT COUNT(*) = 0 AND IDS IS NULL
FROM `[DATASET]`.Output
WHERE description IS NULL AND tries < 3
AND (PARTS IS NULL OR ABS(MOD(FARM_FINGERPRINT(id), PARTS)) = PART)
) THEN LEAVE;
END IF;
-- Generate prompts
CREATE OR REPLACE TEMP TABLE Prompts AS
WITH
Input AS (
SELECT id, TO_JSON_STRING(I) AS properties
FROM `[DATASET]`.Output AS O
INNER JOIN `[DATASET]`.InputProcessing AS I USING (id)
WHERE (PARTS IS NULL OR ABS(MOD(FARM_FINGERPRINT(id), PARTS)) = PART)
AND IF(IDS IS NOT NULL,
O.id IN UNNEST(IDS),
O.description IS NULL AND O.tries < 3)
ORDER BY RAND()
LIMIT 600 -- TODO: Find out how to use a parameter ITEMS_PER_ITERATION here.
),
Numbered AS (
SELECT id, properties, ROW_NUMBER() OVER (ORDER BY id) - 1 AS row_id
FROM Input
)
SELECT
DIV(row_id, ITEMS_PER_PROMPT) AS chunk_id,
`[DATASET]`.DescriptionsPrompt(LANGUAGE, EXAMPLES, ARRAY_AGG(properties ORDER BY id)) AS prompt,
ARRAY_AGG(id ORDER BY id) AS ids
FROM Numbered
GROUP BY 1;
-- Generate descriptions
CREATE OR REPLACE TEMP TABLE Generated AS
SELECT ids, COALESCE(SPLIT(ml_generate_text_llm_result, '\n'), ids) AS output,
FROM
ML.GENERATE_TEXT(
MODEL `[DATASET]`.GeminiFlash,
TABLE Prompts,
STRUCT(
0.1 AS temperature,
2048 AS max_output_tokens,
TRUE AS flatten_json_output));
-- Store generated descriptions in output feed
MERGE `[DATASET]`.Output AS O
USING (
SELECT
COALESCE(REGEXP_EXTRACT(output, r'^([^:]+): .*'), REGEXP_EXTRACT(output, r'^([^:]+)$')) AS id,
REGEXP_EXTRACT(output, r'^[^:]+: (.*)$') AS description
FROM Generated AS G
CROSS JOIN G.output
QUALIFY ROW_NUMBER() OVER (PARTITION BY id) = 1 AND id IN UNNEST(G.ids)
) AS G
ON O.id = G.id
WHEN MATCHED THEN UPDATE SET
O.description = IFNULL(G.description, O.description),
O.tries = O.tries + 1;
IF IDS IS NOT NULL THEN LEAVE;
END IF;
END LOOP;
END;