Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added TD-xxx-try-copilot-self-audit
Empty file.
1 change: 1 addition & 0 deletions include/common/tglobal.h
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ extern int32_t tsAuditLevel;
extern int32_t tsAuditInterval;
extern bool tsAuditHttps;
extern bool tsAuditUseToken;
extern bool tsAuditLocalWrite; // Enable direct write to local cluster

// telem
extern bool tsEnableTelem;
Expand Down
7 changes: 7 additions & 0 deletions source/common/src/tglobal.c
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ int32_t tsAuditInterval = 5000;
int32_t tsAuditLevel = AUDIT_LEVEL_DATABASE;
bool tsAuditHttps = false;
bool tsAuditUseToken = true;
bool tsAuditLocalWrite = true; // Enable direct write to local cluster by default in enterprise
#else
bool tsEnableAudit = false;
bool tsEnableAuditCreateTable = false;
Expand All @@ -251,6 +252,7 @@ int32_t tsAuditInterval = 200000;
int32_t tsAuditLevel = AUDIT_LEVEL_NONE;
bool tsAuditHttps = false;
bool tsAuditUseToken = true;
bool tsAuditLocalWrite = false; // Disable in community edition
#endif

// telem
Expand Down Expand Up @@ -1041,6 +1043,7 @@ static int32_t taosAddServerCfg(SConfig *pCfg) {
TAOS_CHECK_RETURN(cfgAddInt32(pCfg, "auditInterval", tsAuditInterval, 500, 200000, CFG_SCOPE_SERVER, CFG_DYN_SERVER,CFG_CATEGORY_GLOBAL, CFG_PRIV_AUDIT));
TAOS_CHECK_RETURN(cfgAddBool(pCfg, "auditHttps", tsAuditHttps, CFG_SCOPE_SERVER, CFG_DYN_ENT_SERVER,CFG_CATEGORY_GLOBAL, CFG_PRIV_AUDIT));
TAOS_CHECK_RETURN(cfgAddBool(pCfg, "auditUseToken", tsAuditUseToken, CFG_SCOPE_SERVER, CFG_DYN_ENT_SERVER,CFG_CATEGORY_GLOBAL, CFG_PRIV_AUDIT));
TAOS_CHECK_RETURN(cfgAddBool(pCfg, "auditLocalWrite", tsAuditLocalWrite, CFG_SCOPE_SERVER, CFG_DYN_ENT_SERVER,CFG_CATEGORY_GLOBAL, CFG_PRIV_AUDIT));
TAOS_CHECK_RETURN(cfgAddBool(pCfg, "telemetryReporting", tsEnableTelem, CFG_SCOPE_SERVER, CFG_DYN_ENT_SERVER,CFG_CATEGORY_GLOBAL, CFG_PRIV_SYSTEM));
TAOS_CHECK_RETURN(cfgAddInt32(pCfg, "telemetryInterval", tsTelemInterval, 1, 200000, CFG_SCOPE_SERVER, CFG_DYN_SERVER,CFG_CATEGORY_GLOBAL, CFG_PRIV_SYSTEM));
TAOS_CHECK_RETURN(cfgAddString(pCfg, "telemetryServer", tsTelemServer, CFG_SCOPE_SERVER, CFG_DYN_BOTH,CFG_CATEGORY_GLOBAL, CFG_PRIV_SYSTEM));
Expand Down Expand Up @@ -1883,6 +1886,9 @@ static int32_t taosSetServerCfg(SConfig *pCfg) {
TAOS_CHECK_GET_CFG_ITEM(pCfg, pItem, "auditUseToken");
tsAuditUseToken = pItem->bval;

TAOS_CHECK_GET_CFG_ITEM(pCfg, pItem, "auditLocalWrite");
tsAuditLocalWrite = pItem->bval;

TAOS_CHECK_GET_CFG_ITEM(pCfg, pItem, "auditInterval");
tsAuditInterval = pItem->i32;
#endif
Expand Down Expand Up @@ -2924,6 +2930,7 @@ static int32_t taosCfgDynamicOptionsForServer(SConfig *pCfg, const char *name) {
{"auditLevel", &tsAuditLevel},
{"auditHttps", &tsAuditHttps},
{"auditUseToken", &tsAuditUseToken},
{"auditLocalWrite", &tsAuditLocalWrite},
{"slowLogThreshold", &tsSlowLogThreshold},
{"compressMsgSize", &tsCompressMsgSize},
{"compressor", &tsCompressor},
Expand Down
6 changes: 6 additions & 0 deletions source/libs/audit/inc/auditInt.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ typedef struct {
TdThreadRwlock infoLock;
char auditDB[TSDB_DB_FNAME_LEN];
char auditToken[TSDB_TOKEN_LEN];
void *pLocalConn; // Local TAOS connection for direct write
TdThreadMutex connLock; // Lock for local connection
TdThread preConnThread;
int8_t preConnThreadCreated;
int8_t preConnThreadRunning;
int8_t stopPreConnThread;
} SAudit;

#endif /*_TD_AUDIT_INT_H_*/
42 changes: 37 additions & 5 deletions source/libs/audit/src/auditMain.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,24 @@

#define _DEFAULT_SOURCE

#include "tarray.h"
#include "audit.h"
#include "auditInt.h"
#include "osMemory.h"
#include "taos.h"
#include "taoserror.h"
#include "tarray.h"
#include "tglobal.h"
#include "thttp.h"
#include "ttime.h"
#include "tjson.h"
#include "tglobal.h"
#include "audit.h"
#include "osMemory.h"
#include "ttime.h"

SAudit tsAudit = {0};
char* tsAuditUri = "/audit_v2";
char* tsAuditBatchUri = "/audit-batch";

extern int32_t auditPreconnectLocal();
extern void auditStopPreconnectLocal();

static FORCE_INLINE void auditDeleteRecord(SAuditRecord *record) {
if (record) {
taosMemoryFree(record->detail);
Expand All @@ -50,13 +54,37 @@ int32_t auditInit(const SAuditCfg *pCfg) {
taosArrayDestroyP(tsAudit.records, (FDelete)auditDeleteRecord);
return -1;
}
if (taosThreadMutexInit(&tsAudit.connLock, NULL) != 0) {
(void)taosThreadMutexDestroy(&tsAudit.recordLock);
(void)taosThreadRwlockDestroy(&tsAudit.infoLock);
taosArrayDestroyP(tsAudit.records, (FDelete)auditDeleteRecord);
return -1;
Comment on lines +57 to +61
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

In the error path of auditInit, after destroying the records array, it is important to set the pointer to NULL. This prevents potential double-free or use-after-free issues if auditCleanup is called later (which is common during dnode shutdown even if initialization partially failed).

  if (taosThreadMutexInit(&tsAudit.connLock, NULL) != 0) {
    (void)taosThreadMutexDestroy(&tsAudit.recordLock);
    (void)taosThreadRwlockDestroy(&tsAudit.infoLock);
    taosArrayDestroyP(tsAudit.records, (FDelete)auditDeleteRecord);
    tsAudit.records = NULL;
    return -1;

}

// Start non-blocking preconnect in background so startup and RPC threads never wait on taos_connect.
if (auditPreconnectLocal() != 0) {
Comment on lines +64 to +65
Copy link

Copilot AI Apr 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new auditLocalWrite config is added, but auditInit() always starts the local preconnect logic regardless of whether local write is enabled. This can start an unnecessary background thread/work on deployments that have auditLocalWrite=0. Gate the auditPreconnectLocal() call behind the runtime config (e.g., tsAuditLocalWrite) so the feature is truly opt-in at runtime.

Suggested change
// Start non-blocking preconnect in background so startup and RPC threads never wait on taos_connect.
if (auditPreconnectLocal() != 0) {
// Start non-blocking preconnect in background only when local audit write is enabled,
// so startup and RPC threads never wait on taos_connect for an opt-in feature.
if (tsAuditLocalWrite && auditPreconnectLocal() != 0) {

Copilot uses AI. Check for mistakes.
uWarn("failed to start local TDengine preconnect thread, will retry on demand");
}

return 0;
}

// Stores the given dnode id in the global audit state (tsAudit.dnodeId).
void auditSetDnodeId(int32_t dnodeId) {
  tsAudit.dnodeId = dnodeId;
}

void auditCleanup() {
tsLogFp = NULL;

auditStopPreconnectLocal();

// Close local connection
(void)taosThreadMutexLock(&tsAudit.connLock);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Potential undefined behavior: if auditInit fails before tsAudit.connLock is initialized (e.g., if taosArrayInit or taosThreadRwlockInit fails), tsAudit.connLock remains zero-initialized. Calling taosThreadMutexLock on an uninitialized mutex is unsafe. Consider adding a check to verify that the audit system was successfully initialized before attempting to lock and destroy its mutexes.

if (tsAudit.pLocalConn != NULL) {
taos_close(tsAudit.pLocalConn);
tsAudit.pLocalConn = NULL;
}
(void)taosThreadMutexUnlock(&tsAudit.connLock);
(void)taosThreadMutexDestroy(&tsAudit.connLock);
Comment on lines +79 to +86
Copy link

Copilot AI Apr 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

auditCleanup() unconditionally locks/destroys tsAudit.connLock. If auditInit() returns early before connLock is initialized (e.g., records/locks init failure), cleanup can still be invoked during env teardown and this becomes undefined behavior. Track whether connLock was successfully initialized (and whether pLocalConn was ever created) and only lock/destroy/close when initialized; alternatively initialize connLock in a way that makes cleanup always safe.

Copilot uses AI. Check for mistakes.

(void)taosThreadMutexLock(&tsAudit.recordLock);
taosArrayDestroyP(tsAudit.records, (FDelete)auditDeleteRecord);
(void)taosThreadMutexUnlock(&tsAudit.recordLock);
Expand Down Expand Up @@ -93,6 +121,10 @@ void auditRecordImp(SRpcMsg *pReq, int64_t clusterId, char *operation, char *tar
// Empty implementation: accepts the audit-record arguments and does nothing with them.
// NOTE(review): presumably the variant used when the full audit implementation is not
// compiled in; confirm against the surrounding preprocessor conditionals.
void auditAddRecordImp(SRpcMsg *pReq, int64_t clusterId, char *operation, char *target1, char *target2, char *detail,
int32_t len, double duration, int64_t affectedRows) {}

// Stub: performs no work and always reports success (0) to the caller.
int32_t auditPreconnectLocal() {
  return 0;
}

// Stub: there is nothing to stop here, so the body is intentionally empty.
void auditStopPreconnectLocal() {
}

// Stub: intentionally empty; calling it has no effect.
void auditSendRecordsInBatchImp() {}
Expand Down
64 changes: 64 additions & 0 deletions tests/script/sh/stop_dnodes.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/sh

# Keep going when a command fails (errexit off); the stop steps below are best-effort.
set +e
#set -x

# Remove LD_PRELOAD from this shell's environment before running the tools below.
unset LD_PRELOAD
# Locate uname and record its output; OS_TYPE is compared against "Darwin" further down.
UNAME_BIN=`which uname`
OS_TYPE=`$UNAME_BIN`

# Print a process listing for the caller to filter.
#   $1 - command name to select (used on non-Darwin systems only)
# Non-Darwin: `ps -C <name>` selects by command name.
# Darwin:     `$1` is not used; `ps a -c` is printed and callers grep the output.
psby() {
  if [ "$OS_TYPE" != "Darwin" ]; then
    # Quoted so the name is always passed to ps as exactly one argument
    # (no field splitting or pathname expansion).
    ps -C "$1"
  else
    ps a -c
  fi
}

# If a process whose command line contains /usr/bin/taosd is running,
# stop the taosd service through systemctl.
PID=$(ps -efww | grep /usr/bin/taosd | grep -v grep | awk '{print $2}')
if [ -n "$PID" ]; then
  echo systemctl stop taosd
  systemctl stop taosd
fi

PID=`psby taosd | grep -w "[t]aosd" | awk '{print $1}' | head -n 1`
while [ -n "$PID" ]; do
echo kill -9 $PID
#pkill -9 taosd
kill -9 $PID
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Using kill -9 (SIGKILL) as the primary method to stop processes is generally discouraged as it prevents graceful shutdown (e.g., flushing WAL, closing files). It is better to send a SIGTERM first, wait for a few seconds, and then use SIGKILL only if the process is still alive. This applies to lines 28, 42, and 56.

echo "Killing taosd processes"
if [ "$OS_TYPE" != "Darwin" ]; then
fuser -k -n tcp 6030
else
lsof -nti:6030 | xargs kill -9
fi
Comment on lines +31 to +34
Copy link

Copilot AI Apr 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On macOS, lsof -nti:6030 | xargs kill -9 may invoke kill with no arguments when nothing is listening on the port, producing errors/noise. Guard the pipeline (e.g., only run kill when PIDs are non-empty) to keep the stop script quiet and predictable.

Copilot uses AI. Check for mistakes.
PID=`psby taosd | grep -w "[t]aosd" | awk '{print $1}' | head -n 1`
done

# Kill processes matching "taos". The number of attempts is bounded so the script
# still terminates when a process cannot be killed (e.g. insufficient permissions).
MAX_RETRIES=10
RETRY_COUNT=0
PID=`psby taos | grep -w "[t]aos" | awk '{print $1}' | head -n 1`
while [ -n "$PID" ] && [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ]; do
  echo kill -9 $PID
  kill -9 "$PID"
  echo "Killing taos processes"
  # Also kill whatever is using TCP port 6030.
  if [ "$OS_TYPE" != "Darwin" ]; then
    fuser -k -n tcp 6030
  else
    lsof -nti:6030 | xargs kill -9
  fi
  RETRY_COUNT=$((RETRY_COUNT + 1))
  PID=`psby taos | grep -w "[t]aos" | awk '{print $1}' | head -n 1`
  # A process is still listed: give it a moment to exit before the next attempt.
  if [ -n "$PID" ]; then
    sleep 1
  fi
done
# Report instead of spinning forever when the process could not be removed.
if [ -n "$PID" ]; then
  echo "Failed to kill taos processes after $MAX_RETRIES attempts; last PID: $PID"
fi

# Kill processes matching "tmq_sim". The number of attempts is bounded so the script
# still terminates when a process cannot be killed (e.g. insufficient permissions).
MAX_RETRIES=10
RETRY_COUNT=0
PID=`psby tmq_sim | grep -w "[t]mq_sim" | awk '{print $1}' | head -n 1`
while [ -n "$PID" ] && [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ]; do
  echo kill -9 $PID
  kill -9 "$PID"
  echo "Killing tmq_sim processes"
  # Also kill whatever is using TCP port 6030.
  if [ "$OS_TYPE" != "Darwin" ]; then
    fuser -k -n tcp 6030
  else
    lsof -nti:6030 | xargs kill -9
  fi
  RETRY_COUNT=$((RETRY_COUNT + 1))
  PID=`psby tmq_sim | grep -w "[t]mq_sim" | awk '{print $1}' | head -n 1`
  # A process is still listed: give it a moment to exit before the next attempt.
  if [ -n "$PID" ]; then
    sleep 1
  fi
done
# Report instead of spinning forever when the process could not be removed.
if [ -n "$PID" ]; then
  echo "Failed to kill tmq_sim processes after $MAX_RETRIES attempts; last PID: $PID"
fi
Comment on lines +18 to +64
Copy link

Copilot AI Apr 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These while [ -n "$PID" ] loops can spin forever if kill -9 $PID fails (e.g., insufficient permissions or PID reused between iterations). Add a bounded retry counter / timeout, and consider using pkill with process name plus a final check, so the script exits deterministically even when processes can’t be killed.

Suggested change
PID=`ps -efww | grep /usr/bin/taosd | grep -v grep | awk '{print $2}'`
if [ -n "$PID" ]; then
echo systemctl stop taosd
systemctl stop taosd
fi
PID=`psby taosd | grep -w "[t]aosd" | awk '{print $1}' | head -n 1`
while [ -n "$PID" ]; do
echo kill -9 $PID
#pkill -9 taosd
kill -9 $PID
echo "Killing taosd processes"
if [ "$OS_TYPE" != "Darwin" ]; then
fuser -k -n tcp 6030
else
lsof -nti:6030 | xargs kill -9
fi
PID=`psby taosd | grep -w "[t]aosd" | awk '{print $1}' | head -n 1`
done
PID=`psby taos | grep -w "[t]aos" | awk '{print $1}' | head -n 1`
while [ -n "$PID" ]; do
echo kill -9 $PID
#pkill -9 taos
kill -9 $PID
echo "Killing taos processes"
if [ "$OS_TYPE" != "Darwin" ]; then
fuser -k -n tcp 6030
else
lsof -nti:6030 | xargs kill -9
fi
PID=`psby taos | grep -w "[t]aos" | awk '{print $1}' | head -n 1`
done
PID=`psby tmq_sim | grep -w "[t]mq_sim" | awk '{print $1}' | head -n 1`
while [ -n "$PID" ]; do
echo kill -9 $PID
#pkill -9 tmq_sim
kill -9 $PID
echo "Killing tmq_sim processes"
if [ "$OS_TYPE" != "Darwin" ]; then
fuser -k -n tcp 6030
else
lsof -nti:6030 | xargs kill -9
fi
PID=`psby tmq_sim | grep -w "[t]mq_sim" | awk '{print $1}' | head -n 1`
done
kill_processes_with_retry() {
PROC_NAME="$1"
GREP_PATTERN="$2"
MAX_RETRIES=10
RETRY_COUNT=0
PID=`psby "$PROC_NAME" | grep -w "$GREP_PATTERN" | awk '{print $1}' | head -n 1`
while [ -n "$PID" ] && [ "$RETRY_COUNT" -lt "$MAX_RETRIES" ]; do
echo kill -9 "$PID"
kill -9 "$PID"
if command -v pkill >/dev/null 2>&1; then
pkill -9 "$PROC_NAME"
fi
echo "Killing $PROC_NAME processes"
if [ "$OS_TYPE" != "Darwin" ]; then
fuser -k -n tcp 6030
else
lsof -nti:6030 | xargs kill -9
fi
RETRY_COUNT=`expr "$RETRY_COUNT" + 1`
sleep 1
PID=`psby "$PROC_NAME" | grep -w "$GREP_PATTERN" | awk '{print $1}' | head -n 1`
done
if [ -n "$PID" ]; then
echo "Failed to kill $PROC_NAME processes after $MAX_RETRIES attempts; last PID: $PID"
fi
}
PID=`ps -efww | grep /usr/bin/taosd | grep -v grep | awk '{print $2}'`
if [ -n "$PID" ]; then
echo systemctl stop taosd
systemctl stop taosd
fi
kill_processes_with_retry taosd "[t]aosd"
kill_processes_with_retry taos "[t]aos"
kill_processes_with_retry tmq_sim "[t]mq_sim"

Copilot uses AI. Check for mistakes.
59 changes: 59 additions & 0 deletions tests/system-test/0-others/audit_local_write.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from util.log import *
from util.cases import *
from util.sql import *

import time

from ..common.basic import BasicFun


class TDTestCase:
def init(self, conn, logSql, replicaVar=1):
    """Log the start of this test file and set up per-test state.

    Stores ``replicaVar`` (converted to int) on the instance and creates the
    ``BasicFun`` helper as ``self.Fun``. ``conn`` and ``logSql`` are accepted
    but not used in this method.
    """
    tdLog.debug(f"start to execute {__file__}")
    self.replicaVar = int(replicaVar)
    self.Fun = BasicFun()
Comment on lines +4 to +14
Copy link

Copilot AI Apr 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test uses a relative import (from ..common.basic import BasicFun), but tests/system-test is not a Python package (no __init__.py files) and there is no tests/system-test/common/ module in the repo, so the test will fail to import/run. Use the existing system-test utilities via absolute imports (e.g., util.dnodes, tdSql.init(conn.cursor(), ...)) or add the missing package/modules + __init__.py files if relative imports are required.

Suggested change
import time
from ..common.basic import BasicFun
class TDTestCase:
def init(self, conn, logSql, replicaVar=1):
tdLog.debug(f"start to execute {__file__}")
self.replicaVar = int(replicaVar)
self.Fun = BasicFun()
from util.dnodes import *
import time
class BasicFunCompat:
def __init__(self, conn, logSql):
self.conn = conn
self.logSql = logSql
self.TDDnodes = tdDnodes
self.dnodes_count = 1
def config_cluster(self, dnodes_count):
self.dnodes_count = int(dnodes_count)
if hasattr(self.TDDnodes, "init"):
self.TDDnodes.init(self.dnodes_count)
def deploy_start_cluster(self):
if hasattr(self.TDDnodes, "deploy"):
self.TDDnodes.deploy(self.dnodes_count)
if hasattr(self.TDDnodes, "start"):
self.TDDnodes.start(self.dnodes_count)
def connect(self):
tdSql.init(self.conn.cursor(), self.logSql)
class TDTestCase:
def init(self, conn, logSql, replicaVar=1):
tdLog.debug(f"start to execute {__file__}")
self.replicaVar = int(replicaVar)
self.conn = conn
self.logSql = logSql
tdSql.init(conn.cursor(), logSql)
self.Fun = BasicFunCompat(conn, logSql)

Copilot uses AI. Check for mistakes.

def run(self):
    """Start a one-dnode cluster with audit settings, run a few statements,
    then check that ``audit.audit_events`` contains at least one row.
    """
    cluster = self.Fun
    cluster.config_cluster(1)

    # Extra configuration applied to every dnode, in this order.
    extra_cfg = (
        ("audit", "1"),
        ("auditInterval", "500"),
        ("auditLevel", "5"),
        ("enableAuditDelete", "1"),
        ("enableAuditSelect", "1"),
        ("enableAuditInsert", "1"),
        ("auditLocalWrite", "1"),
        ("auditUseToken", "0"),
        ("monitorFqdn", "127.0.0.1"),
        ("monitorPort", "1"),
    )
    for node in cluster.TDDnodes.dnodes:
        for option, value in extra_cfg:
            node.addExtraCfg(option, value)

    cluster.deploy_start_cluster()
    cluster.connect()

    # Workload: create a database and table, insert one row, read it back.
    statements = (
        "drop database if exists audit_case",
        "create database audit_case",
        "use audit_case",
        "create table if not exists t(ts timestamp, c1 int)",
        "insert into t values(now, 1)",
    )
    for statement in statements:
        tdSql.execute(statement)
    tdSql.query("select * from t")
    tdSql.checkRows(1)

    # Poll the audit table up to 20 times, sleeping 1s after each empty result.
    events = 0
    attempts_left = 20
    while attempts_left > 0:
        tdSql.query("select count(*) from audit.audit_events")
        events = tdSql.queryResult[0][0]
        if events > 0:
            break
        time.sleep(1)
        attempts_left -= 1

    tdSql.checkGreater(events, 0)

def stop(self):
    """Close the shared ``tdSql`` handle and log that this test file finished successfully."""
    tdSql.close()
    tdLog.success(f"{__file__} successfully executed")


# Register a TDTestCase instance for this file through both tdCases.addLinux
# and tdCases.addWindows.
tdCases.addLinux(__file__, TDTestCase())
tdCases.addWindows(__file__, TDTestCase())
Loading