refactor(database): Make zlib-compressed values more explicit #4167

Open · wants to merge 20 commits into master

Commits (20)
9dd067b  wip(database): `ZLibCompressed`-types (whisperity, Feb 14, 2024)
d1c5817  fix: Generate the tag properly (whisperity, Feb 14, 2024)
d8c50cc  chore(database): Unit tests for `ZLibCompressed` (whisperity, Feb 15, 2024)
669fb8c  feat(migration): Infrastructure skeleton for progress logging during … (whisperity, Feb 15, 2024)
c8fbf19  feat(migration): Upgrade & Downgrade logic for `zlib` columns (whisperity, Feb 16, 2024)
aff4d23  feat(migration): Per record generic migration wrapper (whisperity, Feb 16, 2024)
a0e9568  feat(migration): `analysis_info`.`analyzer_command` (whisperity, Feb 16, 2024)
c292220  chore: PyLint... (whisperity, Feb 16, 2024)
867064f  chore: PyCodeStyle... (whisperity, Feb 16, 2024)
4eea972  fix: Exception format, ctor is not `LOG()`... (whisperity, Feb 19, 2024)
fa99f19  refactor: `analyzer_statistics` (`version`, `failed_files`) (whisperity, Feb 19, 2024)
9452e6e  chore: Type-annotate DB model entity constructors for changed fields (whisperity, Feb 19, 2024)
f239e94  refactor: `file_contents` (`content`, `blame_info`) (whisperity, Feb 19, 2024)
7e2719f  chore: PyCodeStyle... (whisperity, Feb 19, 2024)
fe85da1  fix: Do NOT convert `None` into `"null"` first for DBMS-`NULLABLE` co… (whisperity, Feb 20, 2024)
bee99bc  fix: Do not keep `config` DB connection alive throughout the migration (whisperity, Feb 21, 2024)
e4dd52e  fix: Do not do multiple rounds of compression to find the exact ratio… (whisperity, Feb 22, 2024)
20704ac  chore: PyLint... (whisperity, Feb 22, 2024)
455dedc  refactor: Try to speed the queries a bit, and add profiling printouts (whisperity, Feb 22, 2024)
f72b5c7  chore: Do not query `ID` column, we already know that value... (whisperity, Feb 22, 2024)
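The commit list centres on a new `ZLibCompressed` family of column types; the model and type definitions themselves are not part of the diff excerpt below. As a rough idea of what such a type could look like, here is a hypothetical sketch built on SQLAlchemy's TypeDecorator. The class names, the JSON variant, and the compression level are assumptions for illustration, not the PR's actual code.

import json
import zlib

from sqlalchemy.types import LargeBinary, TypeDecorator


class ZLibCompressedBlob(TypeDecorator):
    """Stores raw bytes zlib-compressed; decompresses transparently on read."""
    impl = LargeBinary

    def process_bind_param(self, value, dialect):
        # Compress on the way into the database; keep NULLs as NULLs.
        if value is None:
            return None
        return zlib.compress(value, zlib.Z_BEST_COMPRESSION)

    def process_result_value(self, value, dialect):
        # Decompress on the way out of the database.
        if value is None:
            return None
        return zlib.decompress(value)


class ZLibCompressedJSON(ZLibCompressedBlob):
    """Same idea for JSON payloads such as FileContent.blame_info."""

    def process_bind_param(self, value, dialect):
        if value is None:
            return None
        return super().process_bind_param(
            json.dumps(value).encode("utf-8"), dialect)

    def process_result_value(self, value, dialect):
        raw = super().process_result_value(value, dialect)
        return json.loads(raw.decode("utf-8")) if raw is not None else None

With columns declared through such a type (e.g. blame_info = Column(ZLibCompressedJSON(), nullable=True)), the call sites in the diffs below can assign plain strings and dicts, and compression happens once at the ORM boundary instead of at every store or query.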
37 changes: 34 additions & 3 deletions codechecker_common/util.py
@@ -8,16 +8,17 @@
"""
Util module.
"""


import itertools
import json
from typing import TextIO
from math import ceil
from typing import Callable, TextIO
import os

import portalocker

from codechecker_common.logger import get_logger


LOG = get_logger('system')


@@ -106,3 +107,33 @@ def path_for_fake_root(full_path: str, root_path: str = '/') -> str:
relative_path = os.path.relpath(full_path, '/')
fake_root_path = os.path.join(root_path, relative_path)
return os.path.realpath(fake_root_path)


def progress(g, count: int, n: int,
callback: Callable[[int, float], None]):
"""
Wraps a generator of known total length 'count' and fires 'callback' after
every (count / n)-th yielded element. The 'callback' receives the 1-based
index of the element that was just yielded and the progress percentage.
"""
# E.g., if count == 100 and n == 20, the checkpoint thresholds become
# 100, 95, ..., 10, 5 (the trailing 0 is dropped below).
try:
checkpoints = [count] + list(reversed(
[list(chk)[0]
for chk in chunks(
range(0, count + 1),
int(ceil(count / n))
)]))
if checkpoints[-1] == 0:
checkpoints.pop()
except ValueError:
# The range is too small to have (count / n) many slices.
checkpoints = [count]

i = 0
for e in g:
i = i + 1
yield e
if i == checkpoints[-1]:
callback(i, float(i) / count * 100)
checkpoints.pop()
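For context, a minimal usage sketch of the new helper; the reporter function and the iterable are made up for illustration, and the migration code presumably passes a LOG-based callback instead of print().

from codechecker_common.util import progress

def report(index, percent):
    # Hypothetical reporter: called roughly n times over the whole run.
    print(f"{index} rows handled ({percent:.1f}%)")

rows = list(range(1000))  # stand-in for a result set of known size
for row in progress(iter(rows), len(rows), 10, report):
    pass  # per-row migration work would go here

With count=1000 and n=10, report() fires after every 100th yielded element.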
48 changes: 17 additions & 31 deletions web/server/codechecker_server/api/mass_store_run.py
@@ -7,7 +7,6 @@
# -------------------------------------------------------------------------

import base64
import json
import os
import sqlalchemy
import tempfile
@@ -445,18 +444,13 @@ def __add_blame_info(
blame_info, remote_url, tracking_branch = \
get_blame_file_data(blame_file)

compressed_blame_info = None
if blame_info:
compressed_blame_info = zlib.compress(
json.dumps(blame_info).encode('utf-8'),
zlib.Z_BEST_COMPRESSION)

session \
.query(FileContent) \
.filter(FileContent.blame_info.is_(None)) \
.filter(FileContent.content_hash ==
filename_to_hash.get(file_path)) \
.update({"blame_info": compressed_blame_info})
session \
.query(FileContent) \
.filter(FileContent.blame_info.is_(None)) \
.filter(FileContent.content_hash ==
filename_to_hash.get(file_path)) \
.update({"blame_info": blame_info})

session \
.query(File) \
@@ -507,20 +501,17 @@ def __add_file_content(
if not source_file_content:
source_file_content = get_file_content(source_file_name)
try:
compressed_content = zlib.compress(
source_file_content, zlib.Z_BEST_COMPRESSION)

if session.bind.dialect.name == 'postgresql':
insert_stmt = sqlalchemy.dialects.postgresql \
.insert(FileContent).values(
content_hash=content_hash,
content=compressed_content,
content=source_file_content,
blame_info=None).on_conflict_do_nothing(
index_elements=['content_hash'])

session.execute(insert_stmt)
else:
fc = FileContent(content_hash, compressed_content, None)
fc = FileContent(content_hash, source_file_content, None)
session.add(fc)

session.commit()
@@ -572,16 +563,12 @@ def __store_analysis_statistics(
for analyzer_type, stat in stats.items():
analyzer_version = None
if stat["versions"]:
analyzer_version = zlib.compress(
"; ".join(stat["versions"]).encode('utf-8'),
zlib.Z_BEST_COMPRESSION)
analyzer_version = "; ".join(stat["versions"])

failed = 0
compressed_files = None
failed_files = None
if stat["failed_sources"]:
compressed_files = zlib.compress(
'\n'.join(stat["failed_sources"]).encode('utf-8'),
zlib.Z_BEST_COMPRESSION)
failed_files = '\n'.join(stat["failed_sources"])

failed = len(stat["failed_sources"])

@@ -590,7 +577,7 @@

analyzer_statistics = AnalyzerStatistic(
run_history_id, analyzer_type, analyzer_version,
successful, failed, compressed_files)
successful, failed, failed_files)

session.add(analyzer_statistics)

@@ -602,13 +589,10 @@ def __store_analysis_info(
""" Store analysis info for the given run history. """
for src_dir_path, mip in self.__mips.items():
for analyzer_command in mip.check_commands:
cmd = zlib.compress(
analyzer_command.encode("utf-8"),
zlib.Z_BEST_COMPRESSION)

analysis_info_rows = session \
.query(AnalysisInfo) \
.filter(AnalysisInfo.analyzer_command == cmd) \
.filter(AnalysisInfo.analyzer_command ==
analyzer_command) \
.all()

if analysis_info_rows:
@@ -618,7 +602,9 @@
# database. In this case we will select the first one.
analysis_info = analysis_info_rows[0]
else:
analysis_info = AnalysisInfo(analyzer_command=cmd)
analysis_info = AnalysisInfo(
analyzer_command=analyzer_command)

session.add(analysis_info)

run_history.analysis_info.append(analysis_info)
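The effect of this file's changes on call sites, restated as a hedged before/after sketch; FileContent stands in for the ORM entity, and only the post-PR variant relies on the assumed zlib-compressing column type.

import json
import zlib


def store_blame_info_before(file_content, blame_info: dict) -> None:
    # Pre-PR behaviour: every call site serialised and compressed by hand.
    file_content.blame_info = zlib.compress(
        json.dumps(blame_info).encode("utf-8"), zlib.Z_BEST_COMPRESSION)


def store_blame_info_after(file_content, blame_info: dict) -> None:
    # Post-PR behaviour, as in the diff above: hand the raw dict to the ORM
    # and let the (assumed) ZLibCompressed column type serialise and compress
    # it when the value is bound to the database.
    file_content.blame_info = blame_info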
44 changes: 16 additions & 28 deletions web/server/codechecker_server/api/report_server.py
@@ -11,7 +11,6 @@

import base64
import html
import json
import os
import re
import shlex
@@ -1538,11 +1537,9 @@ def getAnalysisInfo(self, analysis_info_filter, limit, offset):
.limit(limit).offset(offset)

for cmd in analysis_info_query:
command = \
zlib.decompress(cmd.analyzer_command).decode('utf-8')

res.append(ttypes.AnalysisInfo(
analyzerCommand=html.escape(command)))
analyzerCommand=html.escape(cmd.analyzer_command)
))

return res

@@ -2635,13 +2632,10 @@ def getSourceFileData(self, fileId, fileContent, encoding):
trackingBranch=sourcefile.tracking_branch)

if fileContent:
source = zlib.decompress(cont.content)

source = cont.content
if encoding == Encoding.BASE64:
source = base64.b64encode(source)

source_file_data.fileContent = source.decode(
'utf-8', errors='ignore')
source_file_data.fileContent = source.decode(errors="ignore")

return source_file_data

@@ -2666,10 +2660,6 @@ def getBlameInfo(self, fileId):
return BlameInfo()

try:
blame_info = json.loads(
zlib.decompress(cont.blame_info).decode(
'utf-8', errors='ignore'))

commits = {
commitHash: Commit(
author=CommitAuthor(
@@ -2679,13 +2669,14 @@
message=html.escape(commit["message"]),
committedDateTime=commit["committed_datetime"],
)
for commitHash, commit in blame_info["commits"].items()
for commitHash, commit
in cont.blame_info["commits"].items()
}

blame_data = [BlameData(
startLine=b["from"],
endLine=b["to"],
commitHash=b["commit"]) for b in blame_info["blame"]]
commitHash=b["commit"]) for b in cont.blame_info["blame"]]

return BlameInfo(
commits=commits,
@@ -2705,7 +2696,7 @@ def getLinesInSourceFileContents(self, lines_in_files_requested, encoding):
# This will contain all the lines for the given fileId
contents_to_file_id = defaultdict(list)
# The goal of the chunking is not to achieve better performance
# but to be compatible with SQLITE dbms with larger report counts,
# but to be compatible with SQLite DBMS with larger report counts,
# with larger report data.
for chunk in util.chunks(
lines_in_files_requested, SQLITE_MAX_VARIABLE_NUMBER):
@@ -2715,14 +2706,14 @@
FileContent.content_hash == File.content_hash) \
.filter(File.id.in_(
[line.fileId if line.fileId is not None
else LOG.warning(
f"File content "
"requested without fileId {l}")
else LOG.warning("File content requested "
"without a fileId %s",
str(line))
for line in chunk])) \
.all()
for content in contents:
lines = zlib.decompress(
content.content).decode('utf-8', 'ignore').split('\n')
lines = content.content.decode(errors="ignore") \
.split('\n')
contents_to_file_id[content.id] = lines

for files in lines_in_files_requested:
@@ -3246,8 +3237,6 @@ def getFailedFiles(self, run_ids):
.filter(AnalyzerStatistic.failed_files.isnot(None))

for failed_files, run_name in query.all():
failed_files = zlib.decompress(failed_files).decode('utf-8')

for failed_file in failed_files.split('\n'):
already_exists = \
any(i.runName == run_name for i in res[failed_file])
@@ -3648,10 +3637,9 @@ def getAnalysisStatistics(self, run_id, run_history_id):
session, run_ids, run_history_ids)

for stat, run_id in query:
failed_files = zlib.decompress(stat.failed_files).decode(
'utf-8').split('\n') if stat.failed_files else []
analyzer_version = zlib.decompress(
stat.version).decode('utf-8') if stat.version else None
failed_files = stat.failed_files.split('\n') \
if stat.failed_files else []
analyzer_version = stat.version if stat.version else None

analyzer_statistics[stat.analyzer_type] = \
ttypes.AnalyzerStatistics(version=analyzer_version,
13 changes: 7 additions & 6 deletions web/server/codechecker_server/cmd/server.py
@@ -578,7 +578,11 @@ def __db_migration(cfg_sql_server, migration_root, environ,

product = sess.query(ORMProduct).filter(
ORMProduct.endpoint == prod).first()
db = database.SQLServer.from_connection_string(product.connection,
connection, endpoint = product.connection, product.endpoint
sess.close()
engine.dispose()

db = database.SQLServer.from_connection_string(connection,
RUN_META,
migration_root,
interactive=False,
@@ -592,7 +596,7 @@
LOG.info(msg)
if db_status == DBStatus.SCHEMA_MISSING:
question = 'Do you want to initialize a new schema for ' \
+ product.endpoint + '? Y(es)/n(o) '
+ endpoint + '? Y(es)/n(o) '
if force_upgrade or env.get_user_input(question):
ret = db.connect(init=True)
msg = database_status.db_status_msg.get(
@@ -603,7 +607,7 @@

elif db_status == DBStatus.SCHEMA_MISMATCH_OK:
question = 'Do you want to upgrade to new schema for ' \
+ product.endpoint + '? Y(es)/n(o) '
+ endpoint + '? Y(es)/n(o) '
if force_upgrade or env.get_user_input(question):
LOG.info("Upgrading schema ...")
ret = db.upgrade()
@@ -614,9 +618,6 @@
else:
LOG.info("No schema migration was done.")

sess.commit()
sess.close()
engine.dispose()
LOG.info("========================")
return 0

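The intent behind commit bee99bc and the server.py diff above, restated as a hedged pattern sketch; the function name and the final call are illustrative stand-ins, only the session/engine handling mirrors the diff.

def release_config_db_then_migrate(sess, engine, product, run_migration):
    # Read the plain values needed for the migration while the config DB
    # session is still open ...
    connection, endpoint = product.connection, product.endpoint
    # ... then close the session and dispose the engine *before* the
    # potentially long per-product migration, so the config DB connection is
    # not held open for its whole duration.
    sess.close()
    engine.dispose()
    # run_migration is a hypothetical stand-in for the per-product
    # SQLServer.from_connection_string() + upgrade sequence.
    return run_migration(connection, endpoint)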
10 changes: 9 additions & 1 deletion web/server/codechecker_server/database/database.py
@@ -197,10 +197,14 @@ def _create_schema(self):
return True

except sqlalchemy.exc.SQLAlchemyError as alch_err:
import traceback
traceback.print_exc()
LOG.error(str(alch_err))
return False

except Exception as ex:
import traceback
traceback.print_exc()
LOG.error("Failed to create initial database schema")
LOG.error(ex)
return False
@@ -328,11 +332,15 @@ def upgrade(self):
return DBStatus.OK

except sqlalchemy.exc.SQLAlchemyError as alch_err:
import traceback
traceback.print_exc()
LOG.error(str(alch_err))
return DBStatus.SCHEMA_UPGRADE_FAILED

except CommandError as cerr:
LOG.debug(str(cerr))
import traceback
traceback.print_exc()
LOG.error(str(cerr))
return DBStatus.SCHEMA_UPGRADE_FAILED

@abstractmethod